From 1f4c0311cda3403999b702c996898af5707973a9 Mon Sep 17 00:00:00 2001 From: David Tellenbach Date: Sun, 18 Apr 2021 23:43:12 +0200 Subject: [PATCH 001/266] Bump to 3.3.91 (3.4-rc1) --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index d2e844928..5862c5ebd 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -17,7 +17,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 90 +#define EIGEN_MINOR_VERSION 91 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ -- GitLab From ab7fe215f9c03e68c12490c6b191c2e1a8878ffb Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 14 Apr 2021 13:54:11 -0700 Subject: [PATCH 002/266] Fix ldexp for AVX512 (#2215) Wrong shuffle was used. Need to interleave low/high halves with a `permute` instruction. Fixes #2215. 
(cherry picked from commit 1d79c68ba0507574d893780e60b982f07d210261) --- Eigen/src/Core/arch/AVX512/PacketMath.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index f8741372d..9307c6763 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -929,7 +929,8 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) // 2^b - Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); Packet8i lo = _mm256_slli_epi64(hi, 52); hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); @@ -937,7 +938,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons // 2^(e - 3b) b = psub(psub(psub(e, b), b), b); // e - 3b - hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); lo = _mm256_slli_epi64(hi, 52); hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); -- GitLab From 28564957accbf142d99779ca15119a65561a500c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Tue, 20 Apr 2021 12:01:45 -0500 Subject: [PATCH 003/266] Fix taking address of rvalue compiler issue with TensorFlow (plus other warnings). 
(cherry picked from commit 06c2760bd1139711eeffa30266ead43423891698) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 26 +++++++++---------- .../src/Core/arch/AltiVec/MatrixProductMMA.h | 10 +++---- 2 files changed, 16 insertions(+), 20 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index e3ba06159..dbdb81ef1 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -493,21 +493,21 @@ struct dhs_cpack { cblock.packet[1] = lhs.template loadPacket(i, j + 2); } } else { - const std::complex *lhs0, *lhs1; + std::complex lhs0, lhs1; if (UseLhs) { - lhs0 = &lhs(j + 0, i); - lhs1 = &lhs(j + 1, i); - cblock.packet[0] = pload2(lhs0, lhs1); - lhs0 = &lhs(j + 2, i); - lhs1 = &lhs(j + 3, i); - cblock.packet[1] = pload2(lhs0, lhs1); + lhs0 = lhs(j + 0, i); + lhs1 = lhs(j + 1, i); + cblock.packet[0] = pload2(&lhs0, &lhs1); + lhs0 = lhs(j + 2, i); + lhs1 = lhs(j + 3, i); + cblock.packet[1] = pload2(&lhs0, &lhs1); } else { - lhs0 = &lhs(i, j + 0); - lhs1 = &lhs(i, j + 1); - cblock.packet[0] = pload2(lhs0, lhs1); - lhs0 = &lhs(i, j + 2); - lhs1 = &lhs(i, j + 3); - cblock.packet[1] = pload2(lhs0, lhs1); + lhs0 = lhs(i, j + 0); + lhs1 = lhs(i, j + 1); + cblock.packet[0] = pload2(&lhs0, &lhs1); + lhs0 = lhs(i, j + 2); + lhs1 = lhs(i, j + 3); + cblock.packet[1] = pload2(&lhs0, &lhs1); } } diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 8edf79c4b..08855bd01 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -99,11 +99,9 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet4f& b) +EIGEN_STRONG_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) { - EIGEN_UNUSED_VARIABLE(acc); // Just 
for compilation - EIGEN_UNUSED_VARIABLE(a); - EIGEN_UNUSED_VARIABLE(b); + // Just for compilation } template @@ -150,11 +148,9 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, _ } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) +EIGEN_STRONG_INLINE void ploadRhsMMA(const float*, __vector_pair&) { // Just for compilation - EIGEN_UNUSED_VARIABLE(rhs); - EIGEN_UNUSED_VARIABLE(rhsV); } // PEEL_MMA loop factor. -- GitLab From 42a8bdd4d70476fd0a768ee582313703e699bed6 Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Tue, 20 Apr 2021 11:14:56 +0000 Subject: [PATCH 004/266] HasExp added for AVX512 Packet8d (cherry picked from commit 2b1dfd1ba0638e57a50d2f401412e0893064c354) --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 69 +--------------------- Eigen/src/Core/arch/AVX512/PacketMath.h | 1 + 2 files changed, 4 insertions(+), 66 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 41929cb34..6fd726d29 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -119,74 +119,11 @@ pexp(const Packet16f& _x) { return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); } -/*template <> +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d pexp(const Packet8d& _x) { - Packet8d x = _x; - - _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); - - _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); - 
_EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); - - // clamp x - x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); - - // Express exp(x) as exp(g + n*log(2)). - const Packet8d n = - _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1); - const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2); - x = psub(x, nC1); - x = psub(x, nC2); - - const Packet8d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. - Packet8d px = p8d_cephes_exp_p0; - px = pmadd(px, x2, p8d_cephes_exp_p1); - px = pmadd(px, x2, p8d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet8d qx = p8d_cephes_exp_q0; - qx = pmadd(qx, x2, p8d_cephes_exp_q1); - qx = pmadd(qx, x2, p8d_cephes_exp_q2); - qx = pmadd(qx, x2, p8d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm512_div_pd(px, psub(qx, px)); - x = pmadd(p8d_2, x, p8d_1); - - // Build e=2^n. - const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64( - _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52)); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. 
- return pmax(pmul(x, e), _x); - }*/ + return pexp_double(_x); +} F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 9307c6763..7d3362f48 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -140,6 +140,7 @@ template<> struct packet_traits : default_packet_traits HasHalfPacket = 1, #if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) HasLog = 1, + HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif -- GitLab From 34d0be9ec1a1b2d4f18a9aa2bab56000c168186c Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Mon, 19 Apr 2021 13:46:38 +0000 Subject: [PATCH 005/266] Compilation of basicbenchmark fixed (cherry picked from commit d72c794ccd21637ba56dec0dd8bd0cffef7bc47e) --- bench/basicbenchmark.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bench/basicbenchmark.h b/bench/basicbenchmark.h index 3fdc35732..8059375b5 100644 --- a/bench/basicbenchmark.h +++ b/bench/basicbenchmark.h @@ -16,13 +16,13 @@ void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations) { asm("#begin_bench_loop LazyEval"); if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize"); - m = (I + 0.00005 * (m + m.lazy() * m)).eval(); + m = (I + 0.00005 * (m + m.lazyProduct(m))).eval(); } else if (Mode==OmpEval) { asm("#begin_bench_loop OmpEval"); if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize"); - m = (I + 0.00005 * (m + m.lazy() * m)).evalOMP(); + m = (I + 0.00005 * (m + m.lazyProduct(m))).eval(); } else { -- GitLab From 54425a39b2ecac3d4e44ce836cf5ee3f44b94767 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 22 Apr 2021 15:21:01 +0000 Subject: [PATCH 006/266] Make vectorized compute_inverse_size4 compile with AVX. 
(cherry picked from commit 85a76a16ea835fcfa7d4c185a338ae2aef9a272a) --- Eigen/LU | 4 +-- Eigen/src/LU/arch/InverseSize4.h | 45 +++++++++++++++++--------------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/Eigen/LU b/Eigen/LU index 0fb184bcb..1236ceb04 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -38,9 +38,7 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON #include "src/LU/arch/InverseSize4.h" #endif diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 5a8d0c114..ee5548aed 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -54,10 +54,12 @@ struct compute_inverse_size4(0); - Packet4f _L2 = matrix.template packet(4); - Packet4f _L3 = matrix.template packet(8); - Packet4f _L4 = matrix.template packet(12); + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); // Four 2x2 sub-matrices of the input matrix // input = [[A, B], @@ -189,25 +191,26 @@ struct compute_inverse_size4(0); - B1 = matrix.template packet(2); - A2 = matrix.template packet(4); - B2 = matrix.template packet(6); - C1 = matrix.template packet(8); - D1 = matrix.template packet(10); - C2 = matrix.template packet(12); - D2 = matrix.template packet(14); + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); } else { Packet2d temp; - A1 = 
matrix.template packet(0); - C1 = matrix.template packet(2); - A2 = matrix.template packet(4); - C2 = matrix.template packet(6); - + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); temp = A1; A1 = vec2d_unpacklo(A1, A2); A2 = vec2d_unpackhi(temp, A2); @@ -216,10 +219,10 @@ struct compute_inverse_size4(8); - D1 = matrix.template packet(10); - B2 = matrix.template packet(12); - D2 = matrix.template packet(14); + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); temp = B1; B1 = vec2d_unpacklo(B1, B2); -- GitLab From 8830d66c02f80f2034550594576e9311fd6edfff Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 21 Apr 2021 15:45:31 -0700 Subject: [PATCH 007/266] DenseStorage safely copy/swap. Fixes #2229. For dynamic matrices with fixed-sized storage, only copy/swap elements that have been set. Otherwise, this leads to inefficient copying, and potential UB for non-initialized elements. 
(cherry picked from commit d213a0bcea2344aa3f6c9856da9f5b2a26ccec25) --- Eigen/src/Core/DenseStorage.h | 61 +++++++++++--- Eigen/src/Core/util/Memory.h | 11 +++ test/SafeScalar.h | 30 +++++++ test/dense_storage.cpp | 144 +++++++++++++++++++++++++--------- test/rvalue_types.cpp | 31 +------- 5 files changed, 199 insertions(+), 78 deletions(-) create mode 100644 test/SafeScalar.h diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index f6e1d0af1..9acca6c90 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -163,6 +163,30 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; +struct plain_array_helper { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void copy(const plain_array& src, const Eigen::Index size, + plain_array& dst) { + smart_copy(src.array, src.array + size, dst.array); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void swap(plain_array& a, const Eigen::Index a_size, + plain_array& b, const Eigen::Index b_size) { + if (a_size < b_size) { + std::swap_ranges(b.array, b.array + a_size, a.array); + smart_move(b.array + a_size, b.array + b_size, a.array + a_size); + } else if (a_size > b_size) { + std::swap_ranges(a.array, a.array + b_size, b.array); + smart_move(a.array + b_size, a.array + a_size, b.array + b_size); + } else { + std::swap_ranges(a.array, a.array + a_size, b.array); + } + } +}; + } // end namespace internal /** \internal @@ -268,21 +292,25 @@ template class DenseStorage class DenseStorage class DenseStorage struct smart_memmove_helper { } }; +#if EIGEN_HAS_RVALUE_REFERENCES +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::move(start, end, target); +} +#else +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::copy(start, end, target); +} +#endif /***************************************************************************** *** 
Implementation of runtime stack allocation (falling back to malloc) *** diff --git a/test/SafeScalar.h b/test/SafeScalar.h new file mode 100644 index 000000000..c5cb75717 --- /dev/null +++ b/test/SafeScalar.h @@ -0,0 +1,30 @@ + +// A Scalar that asserts for uninitialized access. +template +class SafeScalar { + public: + SafeScalar() : initialized_(false) {} + SafeScalar(const SafeScalar& other) { + *this = other; + } + SafeScalar& operator=(const SafeScalar& other) { + val_ = T(other); + initialized_ = true; + return *this; + } + + SafeScalar(T val) : val_(val), initialized_(true) {} + SafeScalar& operator=(T val) { + val_ = val; + initialized_ = true; + } + + operator T() const { + VERIFY(initialized_ && "Uninitialized access."); + return val_; + } + + private: + T val_; + bool initialized_; +}; diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index 7fa25859d..36ccbb02c 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp @@ -8,17 +8,16 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "main.h" +#include "AnnoyingScalar.h" +#include "SafeScalar.h" #include -template -void dense_storage_copy() +template +void dense_storage_copy(int rows, int cols) { - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; + typedef DenseStorage DenseStorageType; - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; const int size = rows*cols; DenseStorageType reference(size, rows, cols); T* raw_reference = reference.data(); @@ -31,14 +30,11 @@ void dense_storage_copy() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } -template -void dense_storage_assignment() +template +void dense_storage_assignment(int rows, int cols) { - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? 
Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; + typedef DenseStorage DenseStorageType; - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; const int size = rows*cols; DenseStorageType reference(size, rows, cols); T* raw_reference = reference.data(); @@ -52,6 +48,34 @@ void dense_storage_assignment() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } +template +void dense_storage_swap(int rows0, int cols0, int rows1, int cols1) +{ + typedef DenseStorage DenseStorageType; + + const int size0 = rows0*cols0; + DenseStorageType a(size0, rows0, cols0); + for (int i=0; i(i); + } + + const int size1 = rows1*cols1; + DenseStorageType b(size1, rows1, cols1); + for (int i=0; i(-i); + } + + a.swap(b); + + for (int i=0; i(i)); + } + + for (int i=0; i(-i)); + } +} + template void dense_storage_alignment() { @@ -78,30 +102,78 @@ void dense_storage_alignment() #endif } -EIGEN_DECLARE_TEST(dense_storage) -{ - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); +template +void dense_storage_tests() { + // Dynamic Storage. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + // Fixed Storage. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); + // Dynamic Storage. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + // Fixed Storage. 
+ dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + + // Dynamic Storage. + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + // Fixed Storage. + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + // Fixed Storage with Uninitialized Elements. 
+ dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); +} - dense_storage_alignment(); - dense_storage_alignment(); - dense_storage_alignment(); - dense_storage_alignment(); +EIGEN_DECLARE_TEST(dense_storage) +{ + dense_storage_tests(); + dense_storage_tests(); + dense_storage_tests >(); + dense_storage_tests(); } diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp index c20a32f79..2c9999ce8 100644 --- a/test/rvalue_types.cpp +++ b/test/rvalue_types.cpp @@ -13,41 +13,12 @@ #if EIGEN_HAS_CXX11 #include "MovableScalar.h" #endif +#include "SafeScalar.h" #include using internal::UIntPtr; -// A Scalar that asserts for uninitialized access. -template -class SafeScalar { - public: - SafeScalar() : initialized_(false) {} - SafeScalar(const SafeScalar& other) { - *this = other; - } - SafeScalar& operator=(const SafeScalar& other) { - val_ = T(other); - initialized_ = true; - return *this; - } - - SafeScalar(T val) : val_(val), initialized_(true) {} - SafeScalar& operator=(T val) { - val_ = val; - initialized_ = true; - } - - operator T() const { - VERIFY(initialized_ && "Uninitialized access."); - return val_; - } - - private: - T val_; - bool initialized_; -}; - #if EIGEN_HAS_RVALUE_REFERENCES template void rvalue_copyassign(const MatrixType& m) -- GitLab From 587a6915169101c0e65c88cbf54f704a63f0a2a4 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 21 Apr 2021 11:03:23 -0700 Subject: [PATCH 008/266] Check existence of BSD random before use. `TensorRandom` currently relies on BSD `random()`, which is not always available. 
The [linux manpage](https://man7.org/linux/man-pages/man3/srandom.3.html) gives the glibc condition: ``` _XOPEN_SOURCE >= 500 || /* Glibc since 2.19: */ _DEFAULT_SOURCE || /* Glibc <= 2.19: */ _SVID_SOURCE || _BSD_SOURCE ``` In particular, this was failing to compile for MinGW via msys2. If not available, we fall back to using `rand()`. (cherry picked from commit 045c0609b5c059974104f29dad91bcc3828e91ac) --- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 34 +++++++++++-------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 13450e1a7..9a20b53bb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -34,9 +34,9 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { // we try to generate seeds faster than the clock resolution. // We need 2 random values since the generator only generate 16 bits at // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; + unsigned rnd1 = static_cast(::rand()); + unsigned rnd2 = static_cast(::rand()); + uint64_t rnd = (rnd1 ^ (rnd2 << 16)) ^ time; return rnd; #elif defined __APPLE__ @@ -45,23 +45,29 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { uint64_t rnd = ::random() ^ mach_absolute_time(); return rnd; -#elif defined __native_client__ - // Same approach as for win32, except using clock_gettime - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec; - return rnd; - #else // Augment the current time with pseudo random number generation // to ensure that we get different seeds if we try to generate seeds // faster than the clock resolution. 
timespec ts; clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; + + + // Check for BSD random(). +#if EIGEN_COMP_GNUC && (\ + defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 500 \ + || /* Glibc since 2.19: */ (defined(_DEFAULT_SOURCE) && _DEFAULT_SOURCE) \ + || /* Glibc <= 2.19: */ (defined(_SVID_SOURCE) && _SVID_SOURCE) \ + || (defined(_BSD_SOURCE) && _BSD_SOURCE) \ + ) + uint64_t rnd = ::random(); +#else + // Build random from rand() + unsigned rnd1 = static_cast(::rand()); + unsigned rnd2 = static_cast(::rand()); + uint64_t rnd = (rnd1 ^ (rnd2 << 16)); +#endif + return rnd ^ ts.tv_nsec; #endif } -- GitLab From baf601a0e392078428c2a8c14f66edc18989b946 Mon Sep 17 00:00:00 2001 From: Turing Eret Date: Fri, 23 Apr 2021 07:43:35 -0600 Subject: [PATCH 009/266] Fix for issue with static global variables in TensorDeviceGpu.h m_deviceProperties and m_devicePropInitialized are defined as global statics which will define multiple copies which can cause issues if initializeDeviceProp() is called in one translation unit and then m_deviceProperties is used in a different translation unit. Added inline functions getDeviceProperties() and getDevicePropInitialized() which defines those variables as static locals. As per the C++ standard 7.1.2/4, a static local declared in an inline function always refers to the same object, so this should be safer. Credit to Sun Chenggen for this fix. This fixes issue #1475. 
(cherry picked from commit 3804ca0d905a0a03357db50abc7468f5f90abc98) --- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 9422dcd7a..d5eff9dc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -42,11 +42,18 @@ class StreamInterface { virtual unsigned int* semaphore() const = 0; }; -static gpuDeviceProp_t* m_deviceProperties; -static bool m_devicePropInitialized = false; +EIGEN_STRONG_INLINE gpuDeviceProp_t*& getDeviceProperties() { + static gpuDeviceProp_t* deviceProperties; + return deviceProperties; +} + +EIGEN_STRONG_INLINE bool& getDevicePropInitialized() { + static bool devicePropInitialized = false; + return devicePropInitialized; +} static void initializeDeviceProp() { - if (!m_devicePropInitialized) { + if (!getDevicePropInitialized()) { // Attempts to ensure proper behavior in the case of multiple threads // calling this function simultaneously. This would be trivial to // implement if we could use std::mutex, but unfortunately mutex don't @@ -64,9 +71,9 @@ static void initializeDeviceProp() { << std::endl; gpu_assert(status == gpuSuccess); } - m_deviceProperties = new gpuDeviceProp_t[num_devices]; + getDeviceProperties() = new gpuDeviceProp_t[num_devices]; for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&m_deviceProperties[i], i); + status = gpuGetDeviceProperties(&getDeviceProperties()[i], i); if (status != gpuSuccess) { std::cerr << "Failed to initialize GPU device #" << i @@ -78,10 +85,10 @@ static void initializeDeviceProp() { } std::atomic_thread_fence(std::memory_order_release); - m_devicePropInitialized = true; + getDevicePropInitialized() = true; } else { // Wait for the other thread to inititialize the properties. 
- while (!m_devicePropInitialized) { + while (!getDevicePropInitialized()) { std::atomic_thread_fence(std::memory_order_acquire); std::this_thread::sleep_for(std::chrono::milliseconds(1000)); } @@ -129,7 +136,7 @@ class GpuStreamDevice : public StreamInterface { const gpuStream_t& stream() const { return *stream_; } const gpuDeviceProp_t& deviceProperties() const { - return m_deviceProperties[device_]; + return getDeviceProperties()[device_]; } virtual void* allocate(size_t num_bytes) const { gpuError_t err = gpuSetDevice(device_); -- GitLab From 63abb10000c36892ddfcdd48de1d318c10c052bf Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Fri, 23 Apr 2021 19:51:43 +0000 Subject: [PATCH 010/266] Tests for pcmp_lt and pcmp_le added (cherry picked from commit 1115f5462ecaa84d3c60479f7e23a530a1a415d2) --- test/packetmath.cpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 67d329a67..79e91c819 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -279,10 +279,45 @@ void packetmath_boolean_mask_ops() { CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq); } +template +void packetmath_boolean_mask_ops_notcomplex() { + const int PacketSize = internal::unpacket_traits::size; + const int size = 2 * PacketSize; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); + + //Test (-0) <=/< (0) for signed operations + for (int i = 0; i < PacketSize; ++i) { + data1[i] = Scalar(-0.0); + data1[i + PacketSize] = internal::random() ? 
data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); + + //Test NaN + for (int i = 0; i < PacketSize; ++i) { + data1[i] = NumTraits::quiet_NaN(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); +} + // Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path // (for some compilers) compute the bitwise and with 0x1 of the results to keep the value in [0,1]. template<> void packetmath_boolean_mask_ops::type>() {} +template<> +void packetmath_boolean_mask_ops_notcomplex::type>() {} template void packetmath_minus_zero_add() { @@ -1020,6 +1055,8 @@ void packetmath_notcomplex() { CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin)); CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax); } + + packetmath_boolean_mask_ops_notcomplex(); } template -- GitLab From ac3c5aad31ba45ddabe9b74f6abe4a4c033da1f9 Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Sun, 25 Apr 2021 20:58:56 +0000 Subject: [PATCH 011/266] Tests added and AVX512 bug fixed for pcmp_lt_or_nan (cherry picked from commit d87648a6bea315645b893c3815ca8c6bb00ec5d2) --- Eigen/src/Core/arch/AVX512/PacketMath.h | 4 ++-- test/packetmath.cpp | 32 +++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 7d3362f48..59bbef0d1 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -487,7 +487,7 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + 
__mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ); return _mm512_castsi512_ps( _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } @@ -518,7 +518,7 @@ EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) { } template <> EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ); + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ); return _mm512_castsi512_pd( _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); } diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 79e91c819..0bb511d5a 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -279,6 +279,36 @@ void packetmath_boolean_mask_ops() { CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq); } +template +void packetmath_boolean_mask_ops_real() { + const int PacketSize = internal::unpacket_traits::size; + const int size = 2 * PacketSize; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); + + //Test (-0) <=/< (0) for signed operations + for (int i = 0; i < PacketSize; ++i) { + data1[i] = Scalar(-0.0); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); + + //Test NaN + for (int i = 0; i < PacketSize; ++i) { + data1[i] = NumTraits::quiet_NaN(); + data1[i + PacketSize] = internal::random() ? 
data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); +} + template void packetmath_boolean_mask_ops_notcomplex() { const int PacketSize = internal::unpacket_traits::size; @@ -609,6 +639,8 @@ void packetmath_real() { CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print); + + packetmath_boolean_mask_ops_real(); // Rounding edge cases. if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) { -- GitLab From 83df5df61b79379e9989a7d23cfba47d97a7f819 Mon Sep 17 00:00:00 2001 From: Theo Fletcher Date: Mon, 26 Apr 2021 16:52:44 +0100 Subject: [PATCH 012/266] Added complex matrix unit tests for SelfAdjointEigenSolve (cherry picked from commit 2ced0cc233fff6ef16c4d098b03aeeb69ff7c509) --- test/eigensolver_selfadjoint.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 65b80c3fb..0fb2f4da7 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -234,15 +234,21 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) { int s = 0; for(int i = 0; i < g_repeat; i++) { + // trivial test for 1x1 matrices: CALL_SUBTEST_1( selfadjointeigensolver(Matrix())); CALL_SUBTEST_1( selfadjointeigensolver(Matrix())); + CALL_SUBTEST_1( selfadjointeigensolver(Matrix, 1, 1>())); + // very important to test 3x3 and 2x2 matrices since we provide special paths for them CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) ); CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) ); + CALL_SUBTEST_12( selfadjointeigensolver(Matrix2cd()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) ); + CALL_SUBTEST_13( selfadjointeigensolver(Matrix3cd()) ); CALL_SUBTEST_2( 
selfadjointeigensolver(Matrix4d()) ); + CALL_SUBTEST_2( selfadjointeigensolver(Matrix4cd()) ); s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) ); @@ -254,6 +260,8 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) // some trivial but implementation-wise tricky cases CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) ); CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(2,2)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(1,1)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(2,2)) ); CALL_SUBTEST_6( selfadjointeigensolver(Matrix()) ); CALL_SUBTEST_7( selfadjointeigensolver(Matrix()) ); } -- GitLab From a33855f6ee56463b4bb217b34d077e00999a90a2 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 27 Apr 2021 14:12:11 -0700 Subject: [PATCH 013/266] Add missing pcmp_lt_or_nan for NEON Packet4bf. (cherry picked from commit 172db7bfc32def5ed0f885287e352b63dd5cd767) --- Eigen/src/Core/arch/NEON/PacketMath.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9cf4e0712..2b48570d1 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3701,6 +3701,11 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt(const Packet4bf& a, return F32MaskToBf16Mask(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); } +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b) { return F32MaskToBf16Mask(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); -- GitLab From fc2cc10842dc084b638aadb562f1064087691753 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 23 Apr 2021 16:04:01 -0700 Subject: [PATCH 014/266] Better CUDA complex division. The original produced NaNs when dividing 0/b for subnormal b. 
The `complex_divide_stable` was changed to use the more common Smith's algorithm. (cherry picked from commit 1c013be2cc6a999268be2f25575cd6a07bd52c45) --- Eigen/src/Core/arch/CUDA/Complex.h | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index b1618e567..deb4c8694 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -67,27 +67,26 @@ std::complex complex_divide_fast(const std::complex& a, const std::complex const T a_imag = numext::imag(a); const T b_real = numext::real(b); const T b_imag = numext::imag(b); - const T norm = T(1) / (b_real * b_real + b_imag * b_imag); - return std::complex((a_real * b_real + a_imag * b_imag) * norm, - (a_imag * b_real - a_real * b_imag) * norm); + const T norm = (b_real * b_real + b_imag * b_imag); + return std::complex((a_real * b_real + a_imag * b_imag) / norm, + (a_imag * b_real - a_real * b_imag) / norm); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex complex_divide_stable(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); const T b_real = numext::real(b); const T b_imag = numext::imag(b); - // Guard against over/under-flow. - const T scale = T(1) / (numext::abs(b_real) + numext::abs(b_imag)); - const T a_real_scaled = numext::real(a) * scale; - const T a_imag_scaled = numext::imag(a) * scale; - const T b_real_scaled = b_real * scale; - const T b_imag_scaled = b_imag * scale; - - const T b_norm2_scaled = b_real_scaled * b_real_scaled + b_imag_scaled * b_imag_scaled; - return std::complex( - (a_real_scaled * b_real_scaled + a_imag_scaled * b_imag_scaled) / b_norm2_scaled, - (a_imag_scaled * b_real_scaled - a_real_scaled * b_imag_scaled) / b_norm2_scaled); + // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf), + // guards against over/under-flow. 
+ const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real); + const T rscale = scale_imag ? T(1) : b_real / b_imag; + const T iscale = scale_imag ? b_imag / b_real : T(1); + const T denominator = b_real * rscale + b_imag * iscale; + return std::complex((a_real * rscale + a_imag * iscale) / denominator, + (a_imag * rscale - a_real * iscale) / denominator); } template -- GitLab From da19f7a9105374958cad7f9ee17c4596a54be51c Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 30 Apr 2021 08:19:48 -0700 Subject: [PATCH 015/266] Simplify TensorRandom and remove time-dependence. Time-dependence prevents tests from being repeatable. This has long been an issue with debugging the tensor tests. Removing this will allow future tests to be repeatable in the usual way. Also, the recently added macros in !476 are causing headaches across different platforms. For example, checking `_XOPEN_SOURCE` is leading to multiple ambiguous macro errors across Google, and `_DEFAULT_SOURCE`/`_SVID_SOURCE`/`_BSD_SOURCE` are sometimes defined with values, sometimes defined as empty, and sometimes not defined at all when they probably should be. This is leading to multiple build breakages. The simplest approach is to generate a seed via `Eigen::internal::random()` if on CPU. For GPU, we use a hash based on the current thread ID (since `rand()` isn't supported on GPU). Fixes #1602. 
(cherry picked from commit e3b7f59659689015aa254ed67c48d870831f086f) --- unsupported/Eigen/CXX11/Tensor | 8 --- .../Eigen/CXX11/src/Tensor/TensorRandom.h | 50 ++----------------- 2 files changed, 4 insertions(+), 54 deletions(-) diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor index d73c6008d..0938bb554 100644 --- a/unsupported/Eigen/CXX11/Tensor +++ b/unsupported/Eigen/CXX11/Tensor @@ -41,14 +41,6 @@ #include #include -#ifdef _WIN32 -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - #if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) #include "ThreadPool" #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 9a20b53bb..37c1d1c3d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -21,53 +21,11 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { // We don't support 3d kernels since we currently only use 1 and // 2d kernels. gpu_assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. 
- // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - unsigned rnd1 = static_cast(::rand()); - unsigned rnd2 = static_cast(::rand()); - uint64_t rnd = (rnd1 ^ (rnd2 << 16)) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - -#else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - - - // Check for BSD random(). -#if EIGEN_COMP_GNUC && (\ - defined(_XOPEN_SOURCE) && _XOPEN_SOURCE >= 500 \ - || /* Glibc since 2.19: */ (defined(_DEFAULT_SOURCE) && _DEFAULT_SOURCE) \ - || /* Glibc <= 2.19: */ (defined(_SVID_SOURCE) && _SVID_SOURCE) \ - || (defined(_BSD_SOURCE) && _BSD_SOURCE) \ - ) - uint64_t rnd = ::random(); + return blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); #else - // Build random from rand() - unsigned rnd1 = static_cast(::rand()); - unsigned rnd2 = static_cast(::rand()); - uint64_t rnd = (rnd1 ^ (rnd2 << 16)); -#endif - return rnd ^ ts.tv_nsec; + // Rely on Eigen's random implementation. + return random(); #endif } -- GitLab From 9e0dc8f09b03fd60b87810d1de14ef122efb685c Mon Sep 17 00:00:00 2001 From: Christoph Hertzberg Date: Thu, 6 May 2021 18:36:47 +0200 Subject: [PATCH 016/266] Revert addition of unused `paddsub`. 
This fixes #2242 (cherry picked from commit 722ca0b665666f3af579002ad752541d7319d1b6) --- Eigen/src/Core/arch/NEON/Complex.h | 7 ------- Eigen/src/Core/arch/SSE/Complex.h | 6 ------ 2 files changed, 13 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 1aa361bc0..a889ab1d2 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -124,13 +124,6 @@ template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); -template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f}; - return Packet2cf(padd(a.v, pxor(mask, b.v))); -} - template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index b1edfa4b2..13b53242e 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -66,12 +66,6 @@ template<> struct unpacket_traits { template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); -template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0)); - return Packet2cf(padd(a.v, pxor(mask, b.v))); -} template<> EIGEN_STRONG_INLINE Packet2cf 
pnegate(const Packet2cf& a) { -- GitLab From 42acbd570028c5dee7e6dbfcfe0ea614f09d9d75 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 7 May 2021 08:24:32 -0700 Subject: [PATCH 017/266] Fix numext::arg return type. The cxx11 path for `numext::arg` incorrectly returned the complex type instead of the real type, leading to compile errors. Fixed this and added tests. Related to !477, which uncovered the issue. (cherry picked from commit 90e9a33e1ce3e4e7663dd67e6c1f225afaf5c206) --- Eigen/src/Core/MathFunctions.h | 9 +++++---- test/numext.cpp | 18 +++++++++++++++++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 29201214f..67b1d8263 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -592,8 +592,9 @@ struct arg_default_impl; template struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) + static inline RealScalar run(const Scalar& x) { #if defined(EIGEN_HIP_DEVICE_COMPILE) // HIP does not seem to have a native device side implementation for the math routine "arg" @@ -601,7 +602,7 @@ struct arg_default_impl { #else EIGEN_USING_STD(arg); #endif - return static_cast(arg(x)); + return static_cast(arg(x)); } }; @@ -612,7 +613,7 @@ struct arg_default_impl { EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); + return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); } }; #else @@ -623,7 +624,7 @@ struct arg_default_impl EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); + return (x < RealScalar(0)) ? 
RealScalar(EIGEN_PI) : RealScalar(0); } }; diff --git a/test/numext.cpp b/test/numext.cpp index cf1ca173d..8a2fde501 100644 --- a/test/numext.cpp +++ b/test/numext.cpp @@ -61,6 +61,20 @@ void check_abs() { } } +template +void check_arg() { + typedef typename NumTraits::Real Real; + VERIFY_IS_EQUAL(numext::abs(T(0)), T(0)); + VERIFY_IS_EQUAL(numext::abs(T(1)), T(1)); + + for(int k=0; k<100; ++k) + { + T x = internal::random(); + Real y = numext::arg(x); + VERIFY_IS_APPROX( y, std::arg(x) ); + } +} + template struct check_sqrt_impl { static void run() { @@ -242,10 +256,12 @@ EIGEN_DECLARE_TEST(numext) { CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs >() ); CALL_SUBTEST( check_abs >() ); + CALL_SUBTEST( check_arg >() ); + CALL_SUBTEST( check_arg >() ); + CALL_SUBTEST( check_sqrt() ); CALL_SUBTEST( check_sqrt() ); CALL_SUBTEST( check_sqrt >() ); -- GitLab From 25424f4cf1c565803677e3814f093edb1585d75f Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 6 May 2021 12:50:51 -0700 Subject: [PATCH 018/266] Clean up gpu device properties. Made a class and singleton to encapsulate initialization and retrieval of device properties. Related to !481, which already changed the API to address a static linkage issue. 
(cherry picked from commit 0eba8a1fe3e0fa78f0e6760c0e1265817491845d) --- .../Eigen/CXX11/src/Tensor/TensorDeviceGpu.h | 116 +++++++++++------- 1 file changed, 69 insertions(+), 47 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index d5eff9dc4..ec2e3cb14 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -42,58 +42,84 @@ class StreamInterface { virtual unsigned int* semaphore() const = 0; }; -EIGEN_STRONG_INLINE gpuDeviceProp_t*& getDeviceProperties() { - static gpuDeviceProp_t* deviceProperties; - return deviceProperties; -} +class GpuDeviceProperties { + public: + GpuDeviceProperties() : + initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { + return device_properties_[device]; + } -EIGEN_STRONG_INLINE bool& getDevicePropInitialized() { - static bool devicePropInitialized = false; - return devicePropInitialized; -} + EIGEN_STRONG_INLINE bool isInitialized() const { + return initialized_; + } -static void initializeDeviceProp() { - if (!getDevicePropInitialized()) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. - static std::atomic first(true); - if (first.exchange(false)) { - // We're the first thread to reach this point. 
- int num_devices; - gpuError_t status = gpuGetDeviceCount(&num_devices); - if (status != gpuSuccess) { - std::cerr << "Failed to get the number of GPU devices: " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - getDeviceProperties() = new gpuDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&getDeviceProperties()[i], i); + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); if (status != gpuSuccess) { - std::cerr << "Failed to initialize GPU device #" - << i - << ": " + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; gpu_assert(status == gpuSuccess); } - } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" + << i + << ": " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + } - std::atomic_thread_fence(std::memory_order_release); - getDevicePropInitialized() = true; - } else { - // Wait for the other thread to inititialize the properties. 
- while (!getDevicePropInitialized()) { - std::atomic_thread_fence(std::memory_order_acquire); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } } } } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); } static const gpuStream_t default_stream = gpuStreamDefault; @@ -103,12 +129,9 @@ class GpuStreamDevice : public StreamInterface { // Use the default stream on the current device GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { gpuGetDevice(&device_); - initializeDeviceProp(); } // Use the default stream on the specified device - GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} // Use the specified stream. Note that it's the // caller responsibility to ensure that the stream can run on // the specified device. 
If no device is specified the code @@ -125,7 +148,6 @@ class GpuStreamDevice : public StreamInterface { gpu_assert(device < num_devices); device_ = device; } - initializeDeviceProp(); } virtual ~GpuStreamDevice() { @@ -136,7 +158,7 @@ class GpuStreamDevice : public StreamInterface { const gpuStream_t& stream() const { return *stream_; } const gpuDeviceProp_t& deviceProperties() const { - return getDeviceProperties()[device_]; + return GetGpuDeviceProperties(device_); } virtual void* allocate(size_t num_bytes) const { gpuError_t err = gpuSetDevice(device_); -- GitLab From 2947c0cc846af09facb5b438953fab02881ed967 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 6 May 2021 19:49:49 -0700 Subject: [PATCH 019/266] Restore ABI compatibility for conj with 3.3, fix conflict with boost. The boost library unfortunately specializes `conj` for various types and assumes the original two-template-parameter version. This changes restores the second parameter. This also restores ABI compatibility. The specialization for `std::complex` is because `std::conj` is not a device function. For custom complex scalar types, users should provide their own `conj` implementation. We may consider removing the unnecessary second parameter in the future - but this will require modifying boost as well. Fixes #2112. 
(cherry picked from commit c0eb5f89a406243f71eae0b705eba4437d9f8565) --- Eigen/src/Core/MathFunctions.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 67b1d8263..f77724052 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -260,16 +260,17 @@ struct conj_default_impl } }; -template struct conj_impl : conj_default_impl {}; +template::IsComplex> +struct conj_impl : conj_default_impl {}; #if defined(EIGEN_GPU_COMPILE_PHASE) template -struct conj_impl > +struct conj_impl, true> { EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& x) { - return std::complex(x.real(), -x.imag()); + return std::complex(numext::real(x), -numext::imag(x)); } }; #endif -- GitLab From 85ebd6aff89906124a2b44a4e5e65815a36aff1d Mon Sep 17 00:00:00 2001 From: Rohit Santhanam Date: Mon, 10 May 2021 19:20:32 +0000 Subject: [PATCH 020/266] Fix for issue where numext::imag and numext::real are used before they are defined. 
(cherry picked from commit 39ec31c0adbdde6b8cda36b3415e9cc2af20dab6) --- Eigen/src/Core/MathFunctions.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index f77724052..7f82090a9 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -263,18 +263,6 @@ struct conj_default_impl template::IsComplex> struct conj_impl : conj_default_impl {}; -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct conj_impl, true> -{ - EIGEN_DEVICE_FUNC - static inline std::complex run(const std::complex& x) - { - return std::complex(numext::real(x), -numext::imag(x)); - } -}; -#endif - template struct conj_retval { @@ -2024,6 +2012,18 @@ struct rsqrt_impl { } }; +#if defined(EIGEN_GPU_COMPILE_PHASE) +template +struct conj_impl, true> +{ + EIGEN_DEVICE_FUNC + static inline std::complex run(const std::complex& x) + { + return std::complex(numext::real(x), -numext::imag(x)); + } +}; +#endif + } // end namespace internal } // end namespace Eigen -- GitLab From d9288f078d22e6f00f24ea29b5b669c09a0d8628 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Fri, 16 Apr 2021 14:04:20 -0500 Subject: [PATCH 021/266] Fix ambiguity due to argument dependent lookup. 
(cherry picked from commit 6753f0f197e7b8a8019e82e7b144ac0281d6a7f1) --- Eigen/src/Core/functors/UnaryFunctors.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index c98fa573c..16136d185 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -109,7 +109,7 @@ struct functor_traits > template struct scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; @@ -138,7 +138,7 @@ struct functor_traits > template struct scalar_arg_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } -- GitLab From d1825cbb68641b3be3c066178e99b0791180c18b Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Mon, 19 Apr 2021 18:05:27 -0500 Subject: [PATCH 022/266] Device implementation of log for std::complex types. 
(cherry picked from commit 7e6a1c129c201db4eff46f4dd68acdc7e935eaf2) --- Eigen/src/Core/MathFunctions.h | 30 +++++++++++++++++++++++++++--- Eigen/src/Core/MathFunctionsImpl.h | 9 +++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 7f82090a9..d7ac4d64d 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2006-2010 Benoit Jacob +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -687,6 +688,30 @@ struct expm1_retval typedef Scalar type; }; +/**************************************************************************** +* Implementation of log * +****************************************************************************/ + +// Complex log defined in MathFunctionsImpl.h. +template EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z); + +template +struct log_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD(log); + return static_cast(log(x)); + } +}; + +template +struct log_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& z) + { + return complex_log(z); + } +}; + /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -700,7 +725,7 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD(log); Scalar x1p = RealScalar(1) + x; - Scalar log_1p = log(x1p); + Scalar log_1p = log_impl::run(x1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); const bool is_inf = numext::equal_strict(x1p, log_1p); return (is_small || is_inf) ? 
x : x * (log_1p / (x1p - RealScalar(1))); @@ -1460,8 +1485,7 @@ T rsqrt(const T& x) template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T &x) { - EIGEN_USING_STD(log); - return static_cast(log(x)); + return internal::log_impl::run(x); } #if defined(SYCL_DEVICE_ONLY) diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h index 0d3f317bb..4eaaaa784 100644 --- a/Eigen/src/Core/MathFunctionsImpl.h +++ b/Eigen/src/Core/MathFunctionsImpl.h @@ -184,6 +184,15 @@ EIGEN_DEVICE_FUNC std::complex complex_rsqrt(const std::complex& z) { : std::complex(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz ); } +template +EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z) { + // Computes complex log. + T a = numext::abs(z); + EIGEN_USING_STD(atan2); + T b = atan2(z.imag(), z.real()); + return std::complex(numext::log(a), b); +} + } // end namespace internal } // end namespace Eigen -- GitLab From 82f13830e6ab8affd2eb129e7b51d3a3d4fc43c9 Mon Sep 17 00:00:00 2001 From: Nathan Luehr Date: Tue, 11 May 2021 22:47:49 +0000 Subject: [PATCH 023/266] Fix calls to device functions from host code (cherry picked from commit 972cf0c28a8d2ee0808c1277dea2c5c206591ce6) --- .../Eigen/CXX11/src/Tensor/TensorArgMax.h | 12 ++--- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 8 +-- .../CXX11/src/Tensor/TensorBroadcasting.h | 9 ++-- .../Eigen/CXX11/src/Tensor/TensorChipping.h | 8 +-- .../CXX11/src/Tensor/TensorConcatenation.h | 8 +-- .../CXX11/src/Tensor/TensorContraction.h | 10 ++-- .../CXX11/src/Tensor/TensorContractionGpu.h | 4 +- .../CXX11/src/Tensor/TensorContractionSycl.h | 6 +-- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 18 +++---- .../CXX11/src/Tensor/TensorConvolution.h | 8 +-- .../CXX11/src/Tensor/TensorConvolutionSycl.h | 6 +-- .../Eigen/CXX11/src/Tensor/TensorCustomOp.h | 16 +++--- .../Eigen/CXX11/src/Tensor/TensorEvalTo.h | 10 ++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 53 +++++++++---------- 
.../Eigen/CXX11/src/Tensor/TensorFFT.h | 6 +-- .../Eigen/CXX11/src/Tensor/TensorForcedEval.h | 9 ++-- .../Eigen/CXX11/src/Tensor/TensorGenerator.h | 6 +-- .../Eigen/CXX11/src/Tensor/TensorImagePatch.h | 28 +++++----- .../Eigen/CXX11/src/Tensor/TensorInflation.h | 6 +-- .../Eigen/CXX11/src/Tensor/TensorLayoutSwap.h | 8 +-- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 28 +++++----- .../Eigen/CXX11/src/Tensor/TensorPadding.h | 8 +-- .../Eigen/CXX11/src/Tensor/TensorPatch.h | 6 +-- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 25 ++------- .../Eigen/CXX11/src/Tensor/TensorRef.h | 8 +-- .../Eigen/CXX11/src/Tensor/TensorReverse.h | 12 ++--- .../Eigen/CXX11/src/Tensor/TensorScan.h | 5 +- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 11 ++-- .../Eigen/CXX11/src/Tensor/TensorStriding.h | 8 +-- .../Eigen/CXX11/src/Tensor/TensorTrace.h | 6 +-- .../CXX11/src/Tensor/TensorVolumePatch.h | 36 ++++++------- 31 files changed, 182 insertions(+), 210 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 91a6f8d6c..8b8fb9235 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -99,18 +99,18 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -240,7 +240,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), m_return_dim(op.return_dim()) @@ -263,11 +263,11 @@ struct TensorEvaluator, Devi return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 72f072cf2..ca0453f79 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -124,7 +124,7 @@ struct TensorEvaluator, Device> RightTensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { eigen_assert(dimensions_match(m_leftImpl.dimensions(), 
m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -154,7 +154,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync( @@ -163,7 +163,7 @@ struct TensorEvaluator, Device> } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index fc75c8d9a..a354132f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -138,8 +138,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : isCopy(false), nByOne(false), oneByN(false), m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) { @@ -211,20 +210,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void 
evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7c6bbd180..376457341 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -164,7 +164,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -200,12 +200,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -433,7 +433,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 0dfe21604..5235a8e6f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -119,7 +119,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -172,14 +172,14 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 424caced1..d442c782c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -417,7 +417,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator typedef DSizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), @@ -602,7 +602,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -617,7 +617,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { 
m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { @@ -908,7 +908,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator kernel.deallocate(this->m_device, packed_mem); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -1005,7 +1005,7 @@ struct TensorEvaluator Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index bb990b378..c81803827 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -1270,7 +1270,7 @@ struct TensorEvaluator::value), @@ -1278,7 +1278,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index a6ca1777a..473c22849 100755 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -1340,10 +1340,10 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (!data) { @@ -1630,7 +1630,7 @@ struct TensorEvaluatorm_leftImpl.cleanup(); this->m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 44493906d..4968babc1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,14 +195,14 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -211,8 +211,7 @@ template struct ConversionSubExprEval< template struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); } }; @@ -221,8 +220,7 @@ template struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { impl.evalSubExprsIfNeededAsync(data, std::move(done)); } }; @@ -363,21 +361,21 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + 
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType data, EvalSubExprsCallback done) { ConversionSubExprEvalAsync, EvaluatorPointerType, @@ -385,7 +383,7 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index df289e2c0..1b71023c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -320,7 +320,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -384,12 +384,12 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 92003c766..033318fdc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -305,7 +305,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.expression()); @@ -114,7 +114,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { 
evalTo(data); return false; @@ -126,7 +126,7 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_result) { m_device.deallocate_temp(m_result); m_result = NULL; @@ -157,7 +157,7 @@ struct TensorEvaluator, Devi #endif protected: - EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + void evalTo(EvaluatorPointerType data) { TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } @@ -279,7 +279,7 @@ struct TensorEvaluator > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 4689b0230..a48d035f5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -131,17 +131,17 @@ struct TensorEvaluator, Device> TensorBlockAssignment; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + EIGEN_STRONG_INLINE ~TensorEvaluator() { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); @@ -149,7 +149,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType scalar, EvalSubExprsCallback done) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); @@ -191,7 +191,7 @@ struct TensorEvaluator, Device> block.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d4532b72c..35fe643ea 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -63,7 +63,7 @@ struct TensorEvaluator TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get((const_cast(m.data())))), m_dims(m.dimensions()), m_device(device) @@ -72,7 +72,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { if (!NumTraits::type>::RequireInitialization && dest) { m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; @@ -82,14 +82,14 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. 
done(evalSubExprsIfNeeded(dest)); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {} + EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); @@ -262,13 +262,13 @@ struct TensorEvaluator TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits::type>::RequireInitialization && data) { m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; @@ -278,14 +278,14 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. 
done(evalSubExprsIfNeeded(dest)); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); @@ -357,7 +357,6 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } @@ -391,17 +390,17 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { done(true); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -455,7 +454,7 @@ struct TensorEvaluator, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) @@ -485,20 +484,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -571,7 +570,7 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { // TODO(ezhulenev): Evaluate two expression in parallel? m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { @@ -631,7 +630,7 @@ struct TensorEvaluator RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) @@ -886,7 +885,7 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -895,7 +894,7 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { @@ -905,7 +904,7 @@ struct TensorEvaluator } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); 
m_elseImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index c62bc5fa9..4a1a0687c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -144,7 +144,7 @@ struct TensorEvaluator, D typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { eigen_assert(input_dims[i] > 0); @@ -169,7 +169,7 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); @@ -181,7 +181,7 @@ struct TensorEvaluator, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); m_data = NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 14020aa68..e800dedc6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -135,16 +135,13 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const 
Device& device) : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); @@ -165,7 +162,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( @@ -185,7 +182,7 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_device.deallocate_temp(m_buffer); m_buffer = NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index b1ff1d8b1..174bf0683 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -111,7 +111,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_generator(op.generator()) { TensorEvaluator argImpl(op.expression(), device); @@ -136,10 +136,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 49d1004f3..dd51850b7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -242,7 +242,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -389,20 +389,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -514,16 +514,16 @@ struct TensorEvaluator, Device> } #endif - Index rowPaddingTop() const 
{ return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 7dadec7fb..c5cb61af5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented 
TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); @@ -137,11 +137,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f159db1b9..80106c1a0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -113,7 +113,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { for(int i = 0; i < NumDims; ++i) { @@ -136,10 +136,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -191,7 +191,7 @@ template typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index ef79c8567..ea97cf185 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size @@ -154,16 +154,16 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType data, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -255,7 +255,7 @@ template RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) 
+ EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -443,7 +443,7 @@ struct TensorEvaluator, Devi TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { for (Index i = 0; i < internal::array_size::value; ++i) { @@ -498,7 +498,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { @@ -534,13 +534,13 @@ struct TensorEvaluator, Devi #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -738,7 +738,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockScratchAllocator TensorBlockScratch; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -906,7 +906,7 @@ struct TensorEvaluator, Device TensorBlock; 
//===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead @@ -151,20 +151,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 64a436e50..413d25dd4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -107,7 +107,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { Index num_patches = 1; @@ -152,12 +152,12 @@ struct TensorEvaluator, Device> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 0a65591e6..583f46256 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -549,7 +549,7 @@ struct TensorReductionEvaluatorBase::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -631,13 +631,6 @@ struct TensorReductionEvaluatorBase EIGEN_STRONG_INLINE -#if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC -#endif void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { @@ -759,19 +749,12 @@ struct TensorReductionEvaluatorBase class Ma struct TensorEvaluator, Device> : public TensorReductionEvaluatorBase, Device> { typedef TensorReductionEvaluatorBase, Device> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} }; @@ -996,7 +979,7 @@ struct TensorEvaluator, : public TensorReductionEvaluatorBase, Eigen::SyclDevice> 
{ typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel //Therefore the coeff function should be overridden by for SYCL kernel EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 030d19844..a27d3646d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -388,17 +388,17 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) : m_ref(m) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_ref.coeff(index); @@ -439,7 +439,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& 
coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 3b1fca59b..586ce68ab 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -121,8 +121,7 @@ struct TensorEvaluator, Device TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) @@ -150,20 +149,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -426,8 +425,7 @@ struct TensorEvaluator, Device> CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} typedef typename XprType::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index a06c4a9f3..beae854dd 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -402,8 +402,7 @@ struct TensorEvaluator, Device> { typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_exclusive(op.exclusive()), @@ -498,7 +497,7 @@ struct TensorEvaluator, Device> { return TensorOpCost(sizeof(CoeffReturnType), 0, 0); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_output) { m_device.deallocate_temp(m_output); m_output = NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index e6fed3d0b..0999815d7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -118,8 +118,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) { @@ -163,20 +162,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void 
evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -384,7 +383,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 64bf3f139..2f62a668f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -109,7 +109,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); @@ -142,11 +142,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -277,7 +277,7 @@ struct TensorEvaluator, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } typedef typename XprType::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 24d22c189..926ecdd38 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -108,7 +108,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { @@ -211,12 +211,12 @@ struct TensorEvaluator, Device> return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 81bed57f3..0beb9ff09 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -194,7 +194,7 @@ struct TensorEvaluator, D typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { 
EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -352,12 +352,12 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -518,21 +518,21 @@ struct TensorEvaluator, D const TensorEvaluator& impl() const { return m_impl; } - Index planePaddingTop() const { return m_planePaddingTop; } - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputPlanes() const { return m_outputPlanes; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userPlaneStride() const { return m_plane_strides; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInPlaneStride() const { return m_in_plane_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index planeInflateStride() const { return m_plane_inflate_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE 
Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } #ifdef EIGEN_USE_SYCL // binding placeholder accessors to a command group handler for SYCL -- GitLab From 2f908f82555c029600a501806f66b8df82bed49a Mon Sep 17 00:00:00 2001 From: guoqiangqi Date: Mon, 10 May 2021 09:27:41 +0800 Subject: [PATCH 024/266] Changing the storage of the SSE complex packets to that of the wrapper. This should fix #2242 . 
(cherry picked from commit 3d9051ea84a5089b277c88dac456b3b1576bfa7f) --- Eigen/src/Core/arch/SSE/Complex.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 13b53242e..1cab374c0 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -19,7 +19,7 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; + Packet4f v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -240,7 +240,7 @@ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; + Packet2d v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going -- GitLab From 77c66e368c7e355f8be299659f57b0ffcaedb505 Mon Sep 17 00:00:00 2001 From: Guoqiang QI Date: Thu, 13 May 2021 15:03:30 +0000 Subject: [PATCH 025/266] Ensure all generated matrices for inverse_4x4 testes are invertible, this fix #2248 . 
(cherry picked from commit 3e006bfd31e4389e8c5718c30409cddb65a73b04) --- test/prec_inverse_4x4.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp index 072466467..86f057118 100644 --- a/test/prec_inverse_4x4.cpp +++ b/test/prec_inverse_4x4.cpp @@ -30,18 +30,17 @@ template void inverse_general_4x4(int repeat) { using std::abs; typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; double error_sum = 0., error_max = 0.; for(int i = 0; i < repeat; ++i) { MatrixType m; - RealScalar absdet; + bool is_invertible; do { m = MatrixType::Random(); - absdet = abs(m.determinant()); - } while(absdet < NumTraits::epsilon()); + is_invertible = Eigen::FullPivLU(m).isInvertible(); + } while(!is_invertible); MatrixType inv = m.inverse(); - double error = double( (m*inv-MatrixType::Identity()).norm() * absdet / NumTraits::epsilon() ); + double error = double( (m*inv-MatrixType::Identity()).norm()); error_sum += error; error_max = (std::max)(error_max, error); } -- GitLab From 0bd9e9bc457f88968d6eba0ebf684bd456502f6c Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Wed, 19 May 2021 08:26:45 +0000 Subject: [PATCH 026/266] ptranpose test for non-square kernels added (cherry picked from commit 8877f8d9b2631301ba070d645cdc3fc9b9f764f5) --- test/packetmath.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 0bb511d5a..18357df5e 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -548,6 +548,25 @@ void packetmath() { } } + const int m_size = PacketSize < 4 ? 
1 : 4; + internal::PacketBlock kernel2; + for (int i = 0; i < m_size; ++i) { + kernel2.packet[i] = internal::pload(data1 + i * PacketSize); + } + ptranspose(kernel2); + int data_counter = 0; + for (int i = 0; i < PacketSize; ++i) { + for (int j = 0; j < m_size; ++j) { + data2[data_counter++] = data1[j*PacketSize + i]; + } + } + for (int i = 0; i < m_size; ++i) { + internal::pstore(data3, kernel2.packet[i]); + for (int j = 0; j < PacketSize; ++j) { + VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose"); + } + } + if (PacketTraits::HasBlend) { Packet thenPacket = internal::pload(data1); Packet elsePacket = internal::pload(data2); -- GitLab From a883a8797cad3c824200b0d78541ead66e43cb5b Mon Sep 17 00:00:00 2001 From: Niall Murphy Date: Mon, 10 May 2021 11:43:49 +0100 Subject: [PATCH 027/266] Use derived object type in conservative_resize_like_impl When calling conservativeResize() on a matrix with DontAlign flag, the temporary variable used to perform the resize should have the same Options as the original matrix to ensure that the correct override of swap is called (i.e. PlainObjectBase::swap(DenseBase & other). Calling the base class swap (i.e in DenseBase) results in assertions errors or memory corruption. (cherry picked from commit 391094c50743f28f9174f455661f650bf07e0177) --- Eigen/src/Core/PlainObjectBase.h | 4 ++-- test/conservative_resize.cpp | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h index 202ed7100..e2ddbd1d5 100644 --- a/Eigen/src/Core/PlainObjectBase.h +++ b/Eigen/src/Core/PlainObjectBase.h @@ -1019,7 +1019,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. 
- typename Derived::PlainObject tmp(rows,cols); + Derived tmp(rows,cols); const Index common_rows = numext::mini(rows, _this.rows()); const Index common_cols = numext::mini(cols, _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); @@ -1054,7 +1054,7 @@ struct conservative_resize_like_impl else { // The storage order does not allow us to use reallocation. - typename Derived::PlainObject tmp(other); + Derived tmp(other); const Index common_rows = numext::mini(tmp.rows(), _this.rows()); const Index common_cols = numext::mini(tmp.cols(), _this.cols()); tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols); diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp index 5dc500068..d709e3346 100644 --- a/test/conservative_resize.cpp +++ b/test/conservative_resize.cpp @@ -148,6 +148,7 @@ EIGEN_DECLARE_TEST(conservative_resize) CALL_SUBTEST_4((run_matrix_tests, Eigen::ColMajor>())); CALL_SUBTEST_5((run_matrix_tests, Eigen::RowMajor>())); CALL_SUBTEST_5((run_matrix_tests, Eigen::ColMajor>())); + CALL_SUBTEST_1((run_matrix_tests())); CALL_SUBTEST_1((run_vector_tests())); CALL_SUBTEST_2((run_vector_tests())); -- GitLab From 4fbd01cd4b20894b63cef7c25f8bbc8f21dcef17 Mon Sep 17 00:00:00 2001 From: Steve Bronder Date: Fri, 21 May 2021 16:25:32 +0000 Subject: [PATCH 028/266] Adds macro for checking if C++14 variable templates are supported (cherry picked from commit 17200570239f23b2f0d3b434bc0269c46c409791) --- Eigen/src/Core/util/IntegralConstant.h | 2 +- Eigen/src/Core/util/Macros.h | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index ef3fdfb94..d457e02ee 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -184,7 +184,7 @@ template struct cleanup_index_type static const internal::FixedInt fix{}; #else diff --git 
a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 5862c5ebd..e5960d073 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -162,8 +162,8 @@ /// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ // XLC version -// 3.1 0x0301 -// 4.5 0x0405 +// 3.1 0x0301 +// 4.5 0x0405 // 5.0 0x0500 // 12.1 0x0C01 #if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) @@ -637,6 +637,14 @@ #define EIGEN_COMP_CXXVER 03 #endif +#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 + #else + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 + #endif +#endif + // The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features // but in practice we should not rely on them but rather on the availabilty of @@ -833,7 +841,7 @@ #endif #endif -// NOTE: the required Apple's clang version is very conservative +// NOTE: the required Apple's clang version is very conservative // and it could be that XCode 9 works just fine. // NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support // and not tested. 
@@ -962,7 +970,7 @@ #endif #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) // All functions callable from CUDA/HIP code must be qualified with __device__ -#elif defined(EIGEN_GPUCC) +#elif defined(EIGEN_GPUCC) #define EIGEN_DEVICE_FUNC __host__ __device__ #else #define EIGEN_DEVICE_FUNC @@ -989,7 +997,7 @@ #else #define eigen_plain_assert(x) #endif -#else +#else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { namespace internal { -- GitLab From 383504630973d0da0ffc64e647b816c575ba42da Mon Sep 17 00:00:00 2001 From: Jakub Lichman Date: Fri, 21 May 2021 14:12:25 +0000 Subject: [PATCH 029/266] predux_half_dowto4 test extended to all applicable packets (cherry picked from commit 12471fcb5d59f969c60a9b78727624dc91e5c04e) --- test/packetmath.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 18357df5e..c81ca63c4 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -518,9 +518,7 @@ void packetmath() { for (int i = 0; i < PacketSize; ++i) ref[0] += data1[i]; VERIFY(test::isApproxAbs(ref[0], internal::predux(internal::pload(data1)), refvalue) && "internal::predux"); - if (PacketSize == 8 && internal::unpacket_traits::half>::size == - 4) // so far, predux_half_downto4 is only required in such a case - { + if (!internal::is_same::half>::value) { int HalfPacketSize = PacketSize > 4 ? PacketSize / 2 : PacketSize; for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0); for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i]; -- GitLab From ee2a8f7139b200b7314b3a31c42238baaabe1942 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Sun, 23 May 2021 12:35:38 -0700 Subject: [PATCH 030/266] Modify Unary/Binary/TernaryOp evaluators to work for non-class types. This used to work for non-class types (e.g. raw function pointers) in Eigen 3.3. 
This was changed in commit 11f55b29 to optimize the evaluator: > `sizeof((A-B).cwiseAbs2())` with A,B Vector4f is now 16 bytes, instead of 48 before this optimization. though I cannot reproduce the 16 byte result. Both before the change and after, with multiple compilers/versions, I always get a result of 40 bytes. https://godbolt.org/z/MsjTc1PGe This change modifies the code slightly to allow non-class types. The final generated code is identical, and the expression remains 40 bytes for the `abs2` sample case. Fixes #2251 (cherry picked from commit ebb300d0b4340104dcade3afa656a57da2b7660c) --- Eigen/src/Core/CoreEvaluators.h | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 90c552f13..66e030b1c 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -606,13 +606,13 @@ struct unary_evaluator, IndexBased > protected: // this helper permits to completely eliminate the functor if it is empty - class Data : private UnaryOp + struct Data { - public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } + const UnaryOp& func() const { return op; } + UnaryOp op; evaluator argImpl; }; @@ -700,12 +700,13 @@ struct ternary_evaluator, IndexBased protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private TernaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} + Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE - const TernaryOp& func() const { return static_cast(*this); } + const TernaryOp& func() const { return op; } + TernaryOp op; evaluator arg1Impl; evaluator arg2Impl; evaluator arg3Impl; @@ -793,12 +794,13 @@ struct binary_evaluator, IndexBased, IndexBase protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private BinaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} + Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const BinaryOp& func() const { return static_cast(*this); } + const BinaryOp& func() const { return op; } + BinaryOp op; evaluator lhsImpl; evaluator rhsImpl; }; @@ -858,12 +860,13 @@ struct unary_evaluator, IndexBased> protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private UnaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } + const UnaryOp& func() const { return op; } + UnaryOp op; evaluator argImpl; }; -- GitLab From 98cf1e076f4899cf42bd864eae8a87631cef0c9d Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 24 May 2021 21:34:35 -0700 Subject: [PATCH 031/266] Add missing NEON ptranspose implementations. Unified implementation using only `vzip`. 
(cherry picked from commit dba753a986b527a17c8cc62474d0487aec7c2b36) --- Eigen/src/Core/arch/NEON/PacketMath.h | 507 +++++++++++--------------- test/packetmath.cpp | 32 +- 2 files changed, 226 insertions(+), 313 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 2b48570d1..73a35c570 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -2774,352 +2774,265 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +// Helpers for ptranspose. +namespace detail { + +template +void zip_in_place(Packet& p1, Packet& p2); + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2f& p1, Packet2f& p2) { + const float32x2x2_t tmp = vzip_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); - kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4f& p1, Packet4f& p2) { + const float32x4x2_t tmp = vzipq_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], 
vdup_n_s32(kernel.packet[0]), 1)); - const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - const int8x8x2_t zip8 = vzip_s8(a,b); - const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8c& p1, Packet8c& p2) { + const int8x8x2_t tmp = vzip_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16c& p1, Packet16c& p2) { + const int8x16x2_t tmp = vzipq_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - int8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_s8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8uc& p1, Packet8uc& p2) { + const uint8x8x2_t tmp = vzip_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_s8(zip8[i*2].val[j]), vreinterpret_u16_s8(zip8[i*2+1].val[j])); - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16uc& p1, Packet16uc& p2) { + const uint8x16x2_t tmp = vzipq_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - 
kernel.packet[i*4+j*2+k] = vreinterpret_s8_u32(z.val[k]); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2i& p1, Packet2i& p2) { + const int32x2x2_t tmp = vzip_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - int8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_s8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4i& p1, Packet4i& p2) { + const int32x4x2_t tmp = vzipq_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_s8(zip8[i*2].val[j]), - vreinterpretq_u16_s8(zip8[i*2+1].val[j])); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2ui& p1, Packet2ui& p2) { + const uint32x2x2_t tmp = vzip_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]), - vreinterpretq_u32_u16(zip16[i*4+j+2].val[k])); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4ui& p1, Packet4ui& p2) { + const uint32x4x2_t tmp = vzipq_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - kernel.packet[i*4+j*2] = vreinterpretq_s8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]), - vget_low_u32(zip32[i+4].val[j]))); - kernel.packet[i*4+j*2+1] = vreinterpretq_s8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]), - vget_high_u32(zip32[i+4].val[j]))); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4s& p1, Packet4s& p2) { + const 
int16x4x2_t tmp = vzip_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); - const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - const uint8x8x2_t zip8 = vzip_u8(a,b); - const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8s& p1, Packet8s& p2) { + const int16x8x2_t tmp = vzipq_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4us& p1, Packet4us& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - uint8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_u8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8us& p1, Packet8us& p2) { + const uint16x8x2_t tmp = vzipq_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_u8(zip8[i*2].val[j]), vreinterpret_u16_u8(zip8[i*2+1].val[j])); - } +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int 
j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_u8_u32(z.val[k]); - } - } +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - uint8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_u8(kernel.packet[i*2], kernel.packet[i*2+1]); +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_u8(zip8[i*2].val[j]), - vreinterpretq_u16_u8(zip8[i*2+1].val[j])); - } + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], kernel.packet[7]); +} +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { + for (int i=0; i<4; ++i) { + const int m = (1 << i); EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { + for (int j=0; j& kernel) 
-{ - const int16x4x2_t zip16_1 = vzip_s16(kernel.packet[0], kernel.packet[1]); - const int16x4x2_t zip16_2 = vzip_s16(kernel.packet[2], kernel.packet[3]); - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[0]), vreinterpret_u32_s16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[1]), vreinterpret_u32_s16(zip16_2.val[1])); +} // namespace detail - kernel.packet[0] = vreinterpret_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]); - - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpretq_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpretq_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpretq_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - const int8x16x2_t zip8_1 = vzipq_s8(kernel.packet[0], kernel.packet[1]); - const int8x16x2_t zip8_2 = vzipq_s8(kernel.packet[2], kernel.packet[3]); + const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], 
vdup_n_s32(kernel.packet[0]), 1)); + const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - const int16x8x2_t zip16_1 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[0]), vreinterpretq_s16_s8(zip8_2.val[0])); - const int16x8x2_t zip16_2 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[1]), vreinterpretq_s16_s8(zip8_2.val[1])); + const int8x8x2_t zip8 = vzip_s8(a,b); + const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); - kernel.packet[0] = vreinterpretq_s8_s16(zip16_1.val[0]); - kernel.packet[1] = vreinterpretq_s8_s16(zip16_1.val[1]); - kernel.packet[2] = vreinterpretq_s8_s16(zip16_2.val[0]); - kernel.packet[3] = vreinterpretq_s8_s16(zip16_2.val[1]); + kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]); - const uint8x16x2_t zip8_2 = vzipq_u8(kernel.packet[2], kernel.packet[3]); - - const uint16x8x2_t zip16_1 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[0]), vreinterpretq_u16_u8(zip8_2.val[0])); - const uint16x8x2_t zip16_2 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[1]), vreinterpretq_u16_u8(zip8_2.val[1])); - - kernel.packet[0] = vreinterpretq_u8_u16(zip16_1.val[0]); - kernel.packet[1] = vreinterpretq_u8_u16(zip16_1.val[1]); - kernel.packet[2] = vreinterpretq_u8_u16(zip16_2.val[0]); - kernel.packet[3] = vreinterpretq_u8_u16(zip16_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]); - const int16x8x2_t zip16_3 = vzipq_s16(kernel.packet[4], kernel.packet[5]); - const int16x8x2_t zip16_4 = vzipq_s16(kernel.packet[6], kernel.packet[7]); + const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); + const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[0]), vreinterpretq_u32_s16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[1]), vreinterpretq_u32_s16(zip16_4.val[1])); + const uint8x8x2_t zip8 = vzip_u8(a,b); + const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); - kernel.packet[0] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), 
vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); + kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint16x4x2_t zip16_1 = vzip_u16(kernel.packet[0], kernel.packet[1]); - const uint16x4x2_t zip16_2 = vzip_u16(kernel.packet[2], kernel.packet[3]); - - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[0]), vreinterpret_u32_u16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[1]), vreinterpret_u32_u16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpret_u16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_u16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_u16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_u16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} 
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint16x8x2_t zip16_1 = vzipq_u16(kernel.packet[0], kernel.packet[1]); - const uint16x8x2_t zip16_2 = vzipq_u16(kernel.packet[2], kernel.packet[3]); - const uint16x8x2_t zip16_3 = vzipq_u16(kernel.packet[4], kernel.packet[5]); - const uint16x8x2_t zip16_4 = vzipq_u16(kernel.packet[6], kernel.packet[7]); - - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[0]), vreinterpretq_u32_u16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[1]), vreinterpretq_u32_u16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[0]), vreinterpretq_u32_u16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[1]), vreinterpretq_u32_u16(zip16_4.val[1])); - kernel.packet[0] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = 
vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int32x2x2_t z = vzip_s32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); - kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); - kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); - kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint32x2x2_t z = vzip_u32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint32x4x2_t tmp1 = vzipq_u32(kernel.packet[0], kernel.packet[1]); - const uint32x4x2_t tmp2 = 
vzipq_u32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_u32(vget_low_u32(tmp1.val[0]), vget_low_u32(tmp2.val[0])); - kernel.packet[1] = vcombine_u32(vget_high_u32(tmp1.val[0]), vget_high_u32(tmp2.val[0])); - kernel.packet[2] = vcombine_u32(vget_low_u32(tmp1.val[1]), vget_low_u32(tmp2.val[1])); - kernel.packet[3] = vcombine_u32(vget_high_u32(tmp1.val[1]), vget_high_u32(tmp2.val[1])); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::zip_in_place(kernel.packet[0], kernel.packet[1]); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { #if EIGEN_ARCH_ARM64 const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]); - const int64x2_t tmp2 = vzip2q_s64(kernel.packet[0], kernel.packet[1]); - + kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]); kernel.packet[0] = tmp1; - kernel.packet[1] = tmp2; #else const int64x1_t tmp[2][2] = { { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) }, @@ -3135,10 +3048,8 @@ ptranspose(PacketBlock& kernel) { #if EIGEN_ARCH_ARM64 const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]); - const uint64x2_t tmp2 = vzip2q_u64(kernel.packet[0], kernel.packet[1]); - + kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]); kernel.packet[0] = tmp1; - kernel.packet[1] = tmp2; #else const uint64x1_t tmp[2][2] = { { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) }, @@ -3468,6 +3379,15 @@ template<> struct unpacket_traits }; }; +namespace detail { +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4bf& p1, Packet4bf& p2) { + const 
uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} +} // namespace detail + EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) { // See the scalar implemention in BFloat16.h for a comprehensible explanation @@ -3674,16 +3594,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - PacketBlock k; - k.packet[0] = kernel.packet[0]; - k.packet[1] = kernel.packet[1]; - k.packet[2] = kernel.packet[2]; - k.packet[3] = kernel.packet[3]; - ptranspose(k); - kernel.packet[0] = k.packet[0]; - kernel.packet[1] = k.packet[1]; - kernel.packet[2] = k.packet[2]; - kernel.packet[3] = k.packet[3]; + detail::ptranspose_impl(kernel); } template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff(const Packet4bf& a, const Packet4bf& b) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index c81ca63c4..121ec7283 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -546,22 +546,24 @@ void packetmath() { } } - const int m_size = PacketSize < 4 ? 
1 : 4; - internal::PacketBlock kernel2; - for (int i = 0; i < m_size; ++i) { - kernel2.packet[i] = internal::pload(data1 + i * PacketSize); - } - ptranspose(kernel2); - int data_counter = 0; - for (int i = 0; i < PacketSize; ++i) { - for (int j = 0; j < m_size; ++j) { - data2[data_counter++] = data1[j*PacketSize + i]; + // GeneralBlockPanelKernel also checks PacketBlock; + if (PacketSize > 4 && PacketSize % 4 == 0) { + internal::PacketBlock kernel2; + for (int i = 0; i < 4; ++i) { + kernel2.packet[i] = internal::pload(data1 + i * PacketSize); } - } - for (int i = 0; i < m_size; ++i) { - internal::pstore(data3, kernel2.packet[i]); - for (int j = 0; j < PacketSize; ++j) { - VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose"); + ptranspose(kernel2); + int data_counter = 0; + for (int i = 0; i < PacketSize; ++i) { + for (int j = 0; j < 4; ++j) { + data2[data_counter++] = data1[j*PacketSize + i]; + } + } + for (int i = 0; i < 4; ++i) { + internal::pstore(data3, kernel2.packet[i]); + for (int j = 0; j < PacketSize; ++j) { + VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose"); + } } } -- GitLab From 573570b6c903d46b35530aeb7bf40918a34c7fb5 Mon Sep 17 00:00:00 2001 From: Cyril Kaiser Date: Sat, 22 May 2021 18:15:32 +0100 Subject: [PATCH 032/266] Remove EIGEN_DEVICE_FUNC from CwiseBinaryOp's default copy constructor. (cherry picked from commit 91cd67f057f90101cf858d63916ee56a58511b0d) --- Eigen/src/Core/CwiseBinaryOp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index 59974a545..2202b1cc6 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -102,7 +102,7 @@ class CwiseBinaryOp : #if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 //Required for Visual Studio or the Copy constructor will probably not get inlined! 
- EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp&) = default; #endif -- GitLab From cbb6ae62969bdaadad612c18e7abc0056f54fbd2 Mon Sep 17 00:00:00 2001 From: Rohit Santhanam Date: Fri, 28 May 2021 20:06:48 +0000 Subject: [PATCH 033/266] Removed dead code from GPU float16 unit test. (cherry picked from commit c8d40a7bf1915015c991b108cf2cd6a32138fdc8) --- .../test/cxx11_tensor_of_float16_gpu.cu | 42 +++++++------------ 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 062f76e26..30bcc1d28 100644 --- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -329,26 +329,22 @@ void test_gpu_reductions(int size1, int size2, int redux) { int num_elem = size1*size2; int result_size = (redux == 1 ? size1 : size2); - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, size1, size2); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, size1, size2); + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, size1, size2); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, result_size); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, result_size); - gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; - gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + gpu_float.device(gpu_device) = gpu_float.random() * 2.0f; Eigen::array redux_dim = {redux}; - gpu_res_float.device(gpu_device) = 
gpu_float1.sum(redux_dim).cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim); + gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().sum(redux_dim); Tensor half_prec(result_size); Tensor full_prec(result_size); @@ -361,8 +357,7 @@ void test_gpu_reductions(int size1, int size2, int redux) { VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } @@ -386,25 +381,21 @@ void test_gpu_full_reductions() { int size = 13; int num_elem = size*size; - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, size, size); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, size, size); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float.device(gpu_device) = gpu_float.random(); - gpu_res_float.device(gpu_device) = gpu_float1.sum().cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(); + gpu_res_float.device(gpu_device) = gpu_float.sum().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().sum(); Tensor half_prec; Tensor full_prec; @@ -414,16 +405,15 @@ void test_gpu_full_reductions() { VERIFY_IS_APPROX(full_prec(), half_prec()); - 
gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().maximum(); + gpu_res_float.device(gpu_device) = gpu_float.maximum().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().maximum(); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); gpu_device.synchronize(); VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } -- GitLab From 85868564df74c906de35bfff925287c4b732d07e Mon Sep 17 00:00:00 2001 From: Nicolas Cornu Date: Tue, 8 Jun 2021 15:48:21 +0200 Subject: [PATCH 034/266] Fix parsing of version for nvhpc As the first line of the version is empty it crashes, so delete first line if it is empty (cherry picked from commit 001a57519a7aa909d3bf0cd8c6ec8a9cd19d9c70) --- cmake/EigenTesting.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 0808446d6..c0b59929d 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -478,6 +478,7 @@ macro(ei_get_compilerver VAR) execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION} OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "^[ \n\r]+" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) string(REGEX REPLACE "[\n\r].*" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER) -- GitLab From 4b502a72156afa527dbc499a07a7edce9eafe15c Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 10 Jun 2021 17:17:39 -0700 Subject: [PATCH 035/266] Fix c++20 warnings about using enums in arithmetic 
expressions. (cherry picked from commit f64b2954c711b7846ae6ae228c5f14bd8dd56ec4) --- Eigen/src/Core/AssignEvaluator.h | 2 +- Eigen/src/Core/BooleanRedux.h | 4 ++-- Eigen/src/Core/CoreEvaluators.h | 6 +++--- Eigen/src/Core/NumTraits.h | 6 +++--- Eigen/src/Core/PartialReduxEvaluator.h | 2 +- Eigen/src/Core/ProductEvaluators.h | 2 +- Eigen/src/Core/Redux.h | 4 ++-- Eigen/src/Core/TriangularMatrix.h | 2 +- Eigen/src/Core/Visitor.h | 2 +- Eigen/src/Core/functors/BinaryFunctors.h | 6 +++--- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 8 ++++---- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 6 +++--- Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2 +- test/vectorization_logic.cpp | 4 ++-- 14 files changed, 28 insertions(+), 28 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index ab2ebf37e..f8c87d0fc 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -591,7 +591,7 @@ struct dense_assignment_loop enum { innerSize = DstXprType::InnerSizeAtCompileTime, packetSize =unpacket_traits::size, - vectorizableSize = (innerSize/packetSize)*packetSize, + vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize), size = DstXprType::SizeAtCompileTime }; for(Index outer = 0; outer < kernel.outerSize(); ++outer) diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index e32c4ac5b..852de8b90 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -81,7 +81,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) @@ -105,7 +105,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const typedef internal::evaluator 
Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 66e030b1c..97231f99d 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -561,7 +561,7 @@ struct unary_evaluator, IndexBased > typedef CwiseUnaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = evaluator::Flags & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? PacketAccessBit : 0)), @@ -736,7 +736,7 @@ struct binary_evaluator, IndexBased, IndexBase typedef CwiseBinaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), LhsFlags = evaluator::Flags, RhsFlags = evaluator::Flags, @@ -817,7 +817,7 @@ struct unary_evaluator, IndexBased> typedef CwiseUnaryView XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index fdd4d4f51..72eac5a93 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -289,9 +289,9 @@ struct NumTraits > IsInteger = NumTraits::IsInteger, IsSigned = NumTraits::IsSigned, RequireInitialization = 1, - ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? 
HugeCost : ArrayType::SizeAtCompileTime * NumTraits::ReadCost, - AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::AddCost, - MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits::MulCost + ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::ReadCost), + AddCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::AddCost), + MulCost = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits::MulCost) }; EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h index 0be694259..29abf35b9 100644 --- a/Eigen/src/Core/PartialReduxEvaluator.h +++ b/Eigen/src/Core/PartialReduxEvaluator.h @@ -145,7 +145,7 @@ struct evaluator > enum { CoeffReadCost = TraversalSize==Dynamic ? HugeCost : TraversalSize==0 ? 
1 - : TraversalSize * evaluator::CoeffReadCost + int(CostOpType::value), + : int(TraversalSize) * int(evaluator::CoeffReadCost) + int(CostOpType::value), _ArgFlags = evaluator::Flags, diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index b766e1a1d..8cf294b28 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -831,7 +831,7 @@ struct diagonal_product_evaluator_base typedef typename ScalarBinaryOpTraits::ReturnType Scalar; public: enum { - CoeffReadCost = NumTraits::MulCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost, + CoeffReadCost = int(NumTraits::MulCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost), MatrixFlags = evaluator::Flags, DiagFlags = evaluator::Flags, diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 30598f415..b6790d110 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -58,7 +58,7 @@ public: public: enum { Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost - : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, + : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits::Cost, UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 
1 : int(PacketSize)) }; @@ -331,7 +331,7 @@ struct redux_impl enum { PacketSize = redux_traits::PacketSize, Size = Evaluator::SizeAtCompileTime, - VectorizedSize = (Size / PacketSize) * PacketSize + VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize) }; template diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 779152fa7..025a1142e 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -819,7 +819,7 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con enum { unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost - && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT + && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT }; triangular_assignment_loop::run(kernel); diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 07a2e4243..00bcca877 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -124,7 +124,7 @@ void DenseBase::visit(Visitor& visitor) const enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits::Cost) <= EIGEN_UNROLLING_LIMIT }; return internal::visitor_impl::run(thisEval, visitor); } diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index a182b4b74..63f09ab93 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -50,7 +50,7 @@ struct scalar_sum_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough 
estimate! + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd // TODO vectorize mixed sum }; @@ -88,7 +88,7 @@ struct scalar_product_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! + Cost = (int(NumTraits::MulCost) + int(NumTraits::MulCost))/2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul // TODO vectorize mixed product }; @@ -364,7 +364,7 @@ struct scalar_difference_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub }; }; diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 79367f197..8362ecc02 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -1673,8 +1673,8 @@ void gebp_kernel::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -530,7 +530,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -604,7 +604,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h 
b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index df6c28d2b..edf844cb8 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -24,7 +24,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 7a853888b..65c653c94 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -159,11 +159,11 @@ struct vectorization_logic EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(), - (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, + (int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) + EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), -- GitLab From 1cb1ffd5b2c36e13c74945290bf0edc60f7b830c Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 10 Jun 2021 19:18:50 -0700 Subject: [PATCH 036/266] Use bit_cast to create -0.0 for floating point types to avoid compiler optimization changing sign with --ffast-math enabled. 
(cherry picked from commit fc87e2cbaa65e7e93a2c695ce5a9dc048a64a985) --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 3 ++- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 3 ++- Eigen/src/Core/arch/NEON/PacketMath.h | 6 +++--- Eigen/src/Core/arch/ZVector/PacketMath.h | 5 +++-- Eigen/src/Geometry/arch/Geometry_SIMD.h | 5 +++-- Eigen/src/LU/arch/InverseSize4.h | 6 +++--- 6 files changed, 16 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 7c70c07b1..d4aee3e21 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -2260,7 +2260,8 @@ static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); -static Packet2d p2d_MZERO = { -0.0, -0.0 }; +static Packet2d p2d_MZERO = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 87e8c2703..8f1c1a874 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -839,7 +839,8 @@ Packet psqrt_complex(const Packet& a) { // Step 4. 
Compute solution for inputs with negative real part: // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] - const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), RealScalar(-0.0))).v; + const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); + const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); Packet negative_real_result; // Notice that rho is positive, so taking it's absolute value is a noop. diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 73a35c570..e1efe9bcb 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -866,12 +866,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, con template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { - Packet2f mask = {-0.0f, 0.0f}; + Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; return padd(a, pxor(mask, b)); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { - Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f}; + Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; return padd(a, pxor(mask, b)); } @@ -3751,7 +3751,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ - const Packet2d mask = {-0.0,0.0}; + const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; return padd(a, pxor(mask, b)); } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index b10c1f6c7..2246439cc 100755 --- 
a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -94,8 +94,9 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { numext::bit_cast<double>(0x8000000000000000ull), + numext::bit_cast<double>(0x8000000000000000ull) }; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h index 9c15bfb98..89ac92062 100644 --- a/Eigen/src/Geometry/arch/Geometry_SIMD.h +++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -146,8 +146,9 @@ struct quat_conj { evaluator qe(q.coeffs()); Quaternion res; - double arr1[2] = {-0.0, -0.0}; - double arr2[2] = {-0.0, 0.0}; + const double neg_zero = numext::bit_cast(0x8000000000000000ull); + double arr1[2] = {neg_zero, neg_zero}; + double arr2[2] = {neg_zero, 0.0}; const Packet2d mask0 = pset(arr1); const Packet2d mask2 = pset(arr2); pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index ee5548aed..106224bbc 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -143,7 +143,7 @@ struct compute_inverse_size4(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; const Packet4f p4f_sign_PNNP = pset(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); @@ -326,8 +326,8 @@ struct compute_inverse_size4(0x8000000000000000ull)}; + const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; const Packet2d sign_PN = pset(sign_mask1); const Packet2d sign_NP = pset(sign_mask2); d1 = pxor(rd, sign_PN); -- GitLab From 4b683b65df6703e083a05d2c5d1578e3e7cf71f4 Mon Sep 17
00:00:00 2001 From: Antonio Sanchez Date: Fri, 11 Jun 2021 08:30:41 -0700 Subject: [PATCH 037/266] Allow custom TENSOR_CONTRACTION_DISPATCH macro. Currently TF lite needs to hack around with the Tensor headers in order to customize the contraction dispatch method. Here we add simple `#ifndef` guards to allow them to provide their own dispatch prior to inclusion. (cherry picked from commit 6aec83263d32c29f6c5623b9716ec7e367693078) --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index d442c782c..8b35f7985 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -633,6 +633,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } #endif // EIGEN_USE_THREADS +#ifndef TENSOR_CONTRACTION_DISPATCH #define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ if (this->m_lhs_inner_dim_contiguous) { \ if (this->m_rhs_inner_dim_contiguous) { \ @@ -663,7 +664,9 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } \ } \ } +#endif +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH #define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ if (this->m_lhs_inner_dim_contiguous) { \ if (this->m_rhs_inner_dim_contiguous) { \ @@ -694,6 +697,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } \ } \ } +#endif EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { static_cast(this)->template evalProduct(buffer); -- GitLab From b5fc69bdd8e25581f1acc4ca37fb956499816936 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 11 Jun 2021 08:21:34 -0700 Subject: [PATCH 038/266] Add ability to permanently enable HIP/CUDA gpu* defines. When using Eigen for gpu, these simplify portability. 
If `EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES` is set, then we do not undefine them. (cherry picked from commit 514977f31b1c00b233969f12321a25d859dd1efa) --- unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h index cb53ce298..a89ea3e9a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -11,6 +11,8 @@ #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) #define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + // Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design // There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler @@ -96,4 +98,6 @@ #endif // gpu_assert +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H -- GitLab From 5e75331b9f2bd890b0727841d5e3bb2d826d377d Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 11 Jun 2021 10:21:07 -0700 Subject: [PATCH 039/266] Fix checking of version number for mingw. MinGW spits out version strings like: `x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110`, which causes the version extraction to fail. Added support for this with tests. Also added `make_unsigned` for `long long`, since mingw seems to use that for `uint64_t`. Related to #2268. CMake and build passes for me after this. 
(cherry picked from commit ad82d20cf649ba8c07352f947fd25766d0328df2) --- Eigen/src/Core/util/Meta.h | 2 ++ cmake/EigenTesting.cmake | 21 ++++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index f66325f89..b6aaed138 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -189,6 +189,8 @@ template<> struct make_unsigned { typedef unsigned int type; } template<> struct make_unsigned { typedef unsigned int type; }; template<> struct make_unsigned { typedef unsigned long type; }; template<> struct make_unsigned { typedef unsigned long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; #if EIGEN_COMP_MSVC template<> struct make_unsigned { typedef unsigned __int64 type; }; template<> struct make_unsigned { typedef unsigned __int64 type; }; diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index c0b59929d..eb8457db6 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -488,9 +488,10 @@ macro(ei_get_compilerver VAR) endmacro() # Extract compiler name and version from a raw version string -# WARNING: if you edit thid macro, then please test it by uncommenting +# WARNING: if you edit this macro, then please test it by uncommenting # the testing macro call in ei_init_testing() of the EigenTesting.cmake file. 
-# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end of the file +# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end +# of the file macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) # extract possible compiler names string(REGEX MATCH "g\\+\\+" ei_has_gpp ${VERSTRING}) @@ -498,6 +499,7 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) string(REGEX MATCH "gcc|GCC" ei_has_gcc ${VERSTRING}) string(REGEX MATCH "icpc|ICC" ei_has_icpc ${VERSTRING}) string(REGEX MATCH "clang|CLANG" ei_has_clang ${VERSTRING}) + string(REGEX MATCH "mingw32" ei_has_mingw ${VERSTRING}) # combine them if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc)) @@ -506,6 +508,8 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) set(${CNAME} "llvm-clang++") elseif(ei_has_clang) set(${CNAME} "clang++") + elseif ((ei_has_mingw) AND (ei_has_gpp OR ei_has_gcc)) + set(${CNAME} "mingw32-g++") elseif(ei_has_icpc) set(${CNAME} "icpc") elseif(ei_has_gpp OR ei_has_gcc) @@ -526,11 +530,17 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) if(NOT eicver) # try to extract 2: string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+" eicver ${VERSTRING}) - else() - set(eicver " _") + if (NOT eicver AND ei_has_mingw) + # try to extract 1 number plus suffix: + string(REGEX MATCH "[^0-9][0-9]+-win32" eicver ${VERSTRING}) + endif() endif() endif() endif() + + if (NOT eicver) + set(eicver " _") + endif() string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver}) @@ -655,6 +665,7 @@ macro(ei_test_get_compilerver_from_cxx_version_string) ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4") + ei_test1_get_compilerver_from_cxx_version_string("x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110" "mingw32-g++" "10-win32") endmacro() # Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets @@ -768,4 +779,4 @@ macro(ei_add_smoke_tests smoke_test_list) set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest") endif() endforeach() -endmacro(ei_add_smoke_tests) \ No newline at end of file +endmacro(ei_add_smoke_tests) -- GitLab From 47722a66f2ab8b287bf381453ed5acd47e43bcb0 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 15 Jun 2021 09:09:31 -0700 Subject: [PATCH 040/266] Fix more enum arithmetic. (cherry picked from commit 13fb5ab92c3226f7b9be20882b0418d53516d35a) --- Eigen/src/Core/BandMatrix.h | 6 ++--- Eigen/src/Core/CoreEvaluators.h | 2 +- Eigen/src/Core/SelfAdjointView.h | 2 +- Eigen/src/Core/SolveTriangular.h | 2 +- Eigen/src/Core/TriangularMatrix.h | 6 ++--- .../Core/products/SelfadjointRank2Update.h | 4 ++-- Eigen/src/Core/util/XprHelper.h | 4 ++-- .../src/Eigenvalues/HessenbergDecomposition.h | 2 +- Eigen/src/SVD/JacobiSVD.h | 8 +++---- Eigen/src/SparseCholesky/SimplicialCholesky.h | 2 +- Eigen/src/SparseCore/SparseCwiseBinaryOp.h | 6 ++--- Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2 +- test/vectorization_logic.cpp | 2 +- .../Eigen/CXX11/src/Tensor/TensorAssign.h | 16 +++++++------- .../Eigen/CXX11/src/Tensor/TensorConversion.h | 4 ++-- .../CXX11/src/Tensor/TensorConvolution.h | 4 ++-- .../Eigen/CXX11/src/Tensor/TensorEvaluator.h | 22 +++++++++---------- 17 files changed, 47 insertions(+), 47 deletions(-) diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 480e04495..878c0240a 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -67,7 
+67,7 @@ class BandMatrixBase : public EigenBase * \warning the internal storage must be column major. */ inline Block col(Index i) { - EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); Index start = 0; Index len = coeffs().rows(); if (i<=supers()) @@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase template struct DiagonalIntReturnType { enum { - ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)), + ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)), Conjugate = ReturnOpposite && NumTraits::IsComplex, ActualIndex = ReturnOpposite ? -Index : Index, DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) @@ -192,7 +192,7 @@ struct traits > Options = _Options, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 
1 + Supers + Subs : Dynamic }; - typedef Matrix CoefficientsType; + typedef Matrix CoefficientsType; }; template diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 97231f99d..0ff8c8deb 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -639,7 +639,7 @@ struct ternary_evaluator, IndexBased typedef CwiseTernaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Arg1Flags = evaluator::Flags, Arg2Flags = evaluator::Flags, diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h index b7ed6f1cd..8ce3b372a 100644 --- a/Eigen/src/Core/SelfAdjointView.h +++ b/Eigen/src/Core/SelfAdjointView.h @@ -66,7 +66,7 @@ template class SelfAdjointView enum { Mode = internal::traits::Mode, Flags = internal::traits::Flags, - TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0) + TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0) }; typedef typename MatrixType::PlainObject PlainObject; diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h index 387944475..dfbf99523 100644 --- a/Eigen/src/Core/SolveTriangular.h +++ b/Eigen/src/Core/SolveTriangular.h @@ -168,7 +168,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl::solveInPlace(c { OtherDerived& other = _other.const_cast_derived(); eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) ); - eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower))); + eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower)))); // If solving for a 0x0 matrix, nothing to do, simply return. 
if (derived().cols() == 0) return; diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 025a1142e..fdb8bc15a 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -53,7 +53,7 @@ template class TriangularBase : public EigenBase typedef Derived const& Nested; EIGEN_DEVICE_FUNC - inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); } + inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } @@ -853,7 +853,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -951,7 +951,7 @@ template EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); - internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); + internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); } namespace internal { diff --git a/Eigen/src/Core/products/SelfadjointRank2Update.h b/Eigen/src/Core/products/SelfadjointRank2Update.h index 09209f733..f752a0bf0 100644 --- a/Eigen/src/Core/products/SelfadjointRank2Update.h +++ b/Eigen/src/Core/products/SelfadjointRank2Update.h @@ -80,8 +80,8 @@ EIGEN_DEVICE_FUNC SelfAdjointView& SelfAdjointView::type>::type UType; - typedef typename internal::remove_all::type>::type VType; + typedef typename internal::remove_all::type>::type UType; + typedef typename internal::remove_all::type>::type VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/Eigen/src/Core/util/XprHelper.h 
b/Eigen/src/Core/util/XprHelper.h index 2c63a9524..f2323174e 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -611,9 +611,9 @@ template MatrixRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType; typedef Array ArrayRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType; typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h index d947dac4e..1f2113934 100644 --- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -267,7 +267,7 @@ template class HessenbergDecomposition private: - typedef Matrix VectorType; + typedef Matrix VectorType; typedef typename NumTraits::Real RealScalar; static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp); diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index a22a2e5c3..8551a06c6 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -112,8 +112,8 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) - : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + TrOptions = RowsAtCompileTime==1 ? (int(MatrixType::Options) & ~(int(RowMajor))) + : ColsAtCompileTime==1 ? (int(MatrixType::Options) | int(RowMajor)) : MatrixType::Options }; typedef Matrix @@ -202,8 +202,8 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? 
(MatrixType::Options & ~(RowMajor)) - : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + TrOptions = RowsAtCompileTime==1 ? (int(MatrixType::Options) & ~(int(RowMajor))) + : ColsAtCompileTime==1 ? (int(MatrixType::Options) | int(RowMajor)) : MatrixType::Options }; diff --git a/Eigen/src/SparseCholesky/SimplicialCholesky.h b/Eigen/src/SparseCholesky/SimplicialCholesky.h index 94c9f0f21..9f93e3255 100644 --- a/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -218,7 +218,7 @@ class SimplicialCholeskyBase : public SparseSolverBase CholMatrixType tmp(size,size); ConstCholMatrixPtr pmat; - if(m_P.size()==0 && (UpLo&Upper)==Upper) + if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper) { // If there is no ordering, try to directly use the input matrix without any copy internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index ff1e49c45..9b0d3f98d 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -211,7 +211,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -298,7 +298,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -457,7 +457,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h 
b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index edf844cb8..32dac0f78 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -79,7 +79,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 65c653c94..97c0bdad9 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -324,7 +324,7 @@ struct vectorization_logic_half EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? 
InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); if(PacketSize>1) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index ca0453f79..e5811d63f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -104,14 +104,14 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), Layout = TensorEvaluator::Layout, RawAccess = TensorEvaluator::RawAccess }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 4968babc1..09d2da9a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -402,8 +402,8 @@ struct TensorEvaluator, Device> const bool Vectorizable = IsSameType ? 
TensorEvaluator::PacketAccess - : TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); return internal::PacketConv::run(m_impl, index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 1b71023c0..b20f80ba2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -307,8 +307,8 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + IsAligned = int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & int(TensorEvaluator::PacketAccess), BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 35fe643ea..3aff7fa01 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -445,8 +445,8 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -556,15 +556,15 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - 
PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false -- GitLab From 2d6eaaf687055e17e399fee0b24edcc25fd9fef5 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 15 Jun 2021 12:15:58 -0700 Subject: [PATCH 041/266] Fix placement of permanent GPU defines. (cherry picked from commit 954879183b1e008d7f0fefb97e48a925c4e3fb16) --- unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 4 ---- .../Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h index a89ea3e9a..cb53ce298 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -11,8 +11,6 @@ #if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) #define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H -#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES - // Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design // There is code in the Tensorflow codebase that will define EIGEN_USE_GPU, but // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler @@ -98,6 +96,4 @@ #endif // gpu_assert -#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES - #endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff 
--git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h index db394bcbb..1d142f2ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -10,6 +10,8 @@ #if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #undef gpuStream_t #undef gpuDeviceProp_t #undef gpuError_t @@ -35,6 +37,8 @@ #undef gpuDeviceSynchronize #undef gpuMemcpy +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H #endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H -- GitLab From 1374f49f28aa93772271789e652704399eaac850 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 15 Jun 2021 13:42:17 -0700 Subject: [PATCH 042/266] Add missing ppc pcmp_lt_or_nan (cherry picked from commit 9e94c5957000c38a6553552c96a7a27b1fc2860d) --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index d4aee3e21..a3ebf9ec2 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -1411,6 +1411,9 @@ template<> EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, con template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt, a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan, a, b); +} template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le, a, b); } -- GitLab From 9fc93ce31acb80430d5945f28b8be09f7bdbe58c Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Wed, 16 Jun 2021 08:49:22 -0500 Subject: [PATCH 
043/266] EIGEN_STRONG_INLINE was NOT inlining in some critical needed areas (6.6X slowdown) when used with Tensorflow. Changing to EIGEN_ALWAYS_INLINE where appropriate. (cherry picked from commit ef1fd341a895fda883f655102f371fa8b41f2088) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 74 +++++++++---------- .../Core/arch/AltiVec/MatrixProductCommon.h | 26 +++---- .../src/Core/arch/AltiVec/MatrixProductMMA.h | 28 +++---- 3 files changed, 59 insertions(+), 69 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index dbdb81ef1..4c5cf1762 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -113,7 +113,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, * float32/64 and complex float32/64 version. **/ template -EIGEN_STRONG_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) +EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) { std::complex v; if(i < j) @@ -403,7 +403,7 @@ struct symm_pack_lhs **/ template -EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); @@ -413,7 +413,7 @@ EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) } template -EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); @@ -992,7 +992,7 @@ struct dhs_cpack -EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) { if(NegativeAccumulate) { @@ -1009,7 +1009,7
@@ EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& l } template -EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) { if(NegativeAccumulate) { @@ -1020,7 +1020,7 @@ EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& l } template -EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) { Packet lhsV = pload(lhs); @@ -1028,7 +1028,7 @@ EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, con } template -EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); @@ -1041,7 +1041,7 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In } template -EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) { Packet lhsV; loadPacketRemaining(lhs, lhsV, remaining_rows); @@ -1051,7 +1051,7 @@ EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, con // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. 
template -EIGEN_STRONG_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) { pger_common(accReal, lhsV, rhsV); if(LhsIsReal) @@ -1070,7 +1070,7 @@ EIGEN_STRONG_INLINE void pgerc_common(PacketBlock* accReal, PacketBloc } template -EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV = ploadLhs(lhs_ptr); Packet lhsVi; @@ -1081,7 +1081,7 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock -EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); @@ -1098,7 +1098,7 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar } template -EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) { Packet lhsV, lhsVi; loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); @@ -1107,14 +1107,14 @@ EIGEN_STRONG_INLINE void 
pgerc(PacketBlock* accReal, PacketBlock -EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs) +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) { return *reinterpret_cast(const_cast(lhs)); } // Zero the accumulator on PacketBlock. template -EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); acc.packet[1] = pset1((Scalar)0); @@ -1123,14 +1123,14 @@ EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) } template -EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); } // Scale the PacketBlock vectors by alpha. template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); @@ -1139,13 +1139,13 @@ EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); } template -EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = pmul(accZ.packet[0], pAlpha); acc.packet[1] = pmul(accZ.packet[1], pAlpha); @@ -1154,14 +1154,14 @@ EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock< } template -EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = 
pmul(accZ.packet[0], pAlpha); } // Complex version of PacketBlock scaling. template -EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) { bscalec_common(cReal, aReal, bReal); @@ -1173,7 +1173,7 @@ EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock -EIGEN_STRONG_INLINE void band(PacketBlock& acc, const Packet& pMask) +EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) { acc.packet[0] = pand(acc.packet[0], pMask); acc.packet[1] = pand(acc.packet[1], pMask); @@ -1182,7 +1182,7 @@ EIGEN_STRONG_INLINE void band(PacketBlock& acc, const Packet& pMask) } template -EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) { band(aReal, pMask); band(aImag, pMask); @@ -1192,7 +1192,7 @@ EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); @@ -1209,7 +1209,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res // An overload of bload when you have a PacketBLock with 8 vectors. 
template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); @@ -1233,7 +1233,7 @@ EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res } template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); @@ -1246,7 +1246,7 @@ const static Packet4i mask43 = { -1, -1, -1, 0 }; const static Packet2l mask21 = { -1, 0 }; template -EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(float(0.0)); // Not used @@ -1260,7 +1260,7 @@ EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) } template<> -EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) +EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(double(0.0)); // Not used @@ -1270,7 +1270,7 @@ EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) } template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) { band(accZ, pMask); @@ -1278,13 +1278,13 @@ EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock -EIGEN_STRONG_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, 
Packet& a3) { pbroadcast4(a, a0, a1, a2, a3); } template<> -EIGEN_STRONG_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { a1 = pload(a); a3 = pload(a + 2); @@ -1298,7 +1298,7 @@ EIGEN_STRONG_INLINE void pbroadcast4_old(const double* a, Packet2d& a0 #define PEEL 7 template -EIGEN_STRONG_INLINE void MICRO_EXTRA_COL( +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, PacketBlock &accZero, @@ -1362,7 +1362,7 @@ EIGEN_STRONG_INLINE void gemm_extra_col( } template -EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW( +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, PacketBlock &accZero, @@ -1565,7 +1565,6 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( Index col, const Packet& pAlpha) { -asm("#gemm begin"); const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; @@ -1588,7 +1587,6 @@ asm("#gemm begin"); MICRO_STORE row += unroll_factor*accCols; -asm("#gemm end"); } template @@ -1789,7 +1787,7 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const #define PEEL_COMPLEX 3 template -EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL( +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, PacketBlock &accReal, PacketBlock &accImag, @@ -1888,7 +1886,7 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_col( } template -EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW( +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real, const Scalar* 
&rhs_ptr_imag, PacketBlock &accReal, PacketBlock &accImag, @@ -1924,7 +1922,6 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pAlphaImag, const Packet& pMask) { -asm("#gemm_complex begin"); const Scalar* rhs_ptr_real = rhs_base; const Scalar* rhs_ptr_imag; if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; @@ -2001,7 +1998,6 @@ asm("#gemm_complex begin"); } } } -asm("#gemm_complex end"); } #define MICRO_COMPLEX_UNROLL(func) \ @@ -2173,7 +2169,6 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const Packet& pAlphaReal, const Packet& pAlphaImag) { -asm("#gemm_complex_unrolled begin"); const Scalar* rhs_ptr_real = rhs_base; const Scalar* rhs_ptr_imag; if(!RhsIsReal) { @@ -2211,7 +2206,6 @@ asm("#gemm_complex_unrolled begin"); MICRO_COMPLEX_STORE row += unroll_factor*accCols; -asm("#gemm_complex_unrolled end"); } template diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 6e74116b9..41b27bf3d 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -54,7 +54,7 @@ EIGEN_STRONG_INLINE void gemm_unrolled_col( const Packet& pAlpha); template -EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows); +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows); template EIGEN_STRONG_INLINE void gemm_complex_extra_col( @@ -107,19 +107,19 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( const Packet& pAlphaImag); template -EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs); +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& 
res, Index row, Index col); template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); template -EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, 16, 17, 18, 19, @@ -141,7 +141,7 @@ const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14 // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. template -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); @@ -155,7 +155,7 @@ EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketB } template -EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { bcouple_common(taccReal, taccImag, acc1, acc2); @@ -171,7 +171,7 @@ EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, 
PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); @@ -179,7 +179,7 @@ EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketB } template -EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { bcouple_common(taccReal, taccImag, acc1, acc2); @@ -189,7 +189,7 @@ EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); @@ -203,7 +203,7 @@ EIGEN_STRONG_INLINE void bcouple_common(PacketBlock -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); @@ -212,7 +212,7 @@ EIGEN_STRONG_INLINE void bcouple_common(PacketBlock -EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs) +EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) { return *reinterpret_cast(const_cast(rhs)); } diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 08855bd01..13d9517e4 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ 
b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -24,13 +24,13 @@ namespace Eigen { namespace internal { template -EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad* acc) +EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); } template -EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); @@ -44,7 +44,7 @@ EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& da } template -EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) { PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); @@ -65,7 +65,7 @@ EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMap // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) { if(NegativeAccumulate) { @@ -76,7 +76,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) { __vector_pair* a0 = (__vector_pair *)(&a.packet[0]); if(NegativeAccumulate) @@ -88,7 
+88,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) { if(NegativeAccumulate) { @@ -99,13 +99,13 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) { // Just for compilation } template -EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) +EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) { pgerMMA(accReal, rhsV, lhsV); if(LhsIsReal) { @@ -123,20 +123,20 @@ EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. 
template -EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) { rhsV = ploadRhs((const Scalar*)(rhs)); } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) { rhsV.packet[0] = ploadRhs((const double *)((Packet2d *)rhs )); rhsV.packet[1] = ploadRhs((const double *)(((Packet2d *)rhs) + 1)); } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) { #if EIGEN_COMP_LLVM __builtin_vsx_assemble_pair(&rhsV, @@ -148,7 +148,7 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, _ } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const float*, __vector_pair&) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) { // Just for compilation } @@ -255,7 +255,6 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( Index col, const Packet& pAlpha) { -asm("#gemm_MMA begin"); const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; @@ -277,7 +276,6 @@ asm("#gemm_MMA begin"); MICRO_MMA_STORE row += unroll_factor*accCols; -asm("#gemm_MMA end"); } template @@ -505,7 +503,6 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( const Packet& pAlphaReal, const Packet& pAlphaImag) { -asm("#gemm_complex_MMA begin"); const Scalar* rhs_ptr_real = rhs_base; const Scalar* rhs_ptr_imag; if(!RhsIsReal) { @@ -538,7 +535,6 @@ asm("#gemm_complex_MMA begin"); MICRO_COMPLEX_MMA_STORE row += unroll_factor*accCols; -asm("#gemm_complex_MMA end"); } template -- GitLab From ee4e099aa24e60965f5bb2accb15e0371be04e97 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 16 Jun 2021 
14:36:42 -0700 Subject: [PATCH 044/266] Remove pset, replace with ploadu. We can't make guarantees on alignment for existing calls to `pset`, so we should default to loading unaligned. But in that case, we should just use `ploadu` directly. For loading constants, this load should hopefully get optimized away. This is causing segfaults in Google Maps. (cherry picked from commit 12e8d57108c50d8a63605c6eb0144c838c128337) --- .../arch/Default/GenericPacketMathFunctions.h | 6 ------ .../Default/GenericPacketMathFunctionsFwd.h | 4 ---- Eigen/src/Geometry/arch/Geometry_SIMD.h | 18 ++++++++++-------- Eigen/src/LU/arch/InverseSize4.h | 6 +++--- 4 files changed, 13 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 8f1c1a874..bb4f719a8 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -19,12 +19,6 @@ namespace Eigen { namespace internal { -template EIGEN_DEVICE_FUNC inline Packet -pset(const typename unpacket_traits::type (&a)[N] /* a */) { - EIGEN_STATIC_ASSERT(unpacket_traits::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE); - return pload(a); -} - // Creates a Scalar integer type with same bit-width. template struct make_integer; template<> struct make_integer { typedef numext::int32_t type; }; diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 637e5f4af..177a04e93 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -17,10 +17,6 @@ namespace internal { // implemented in GenericPacketMathFunctions.h // This is needed to workaround a circular dependency. 
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */ -template EIGEN_DEVICE_FUNC inline Packet -pset(const typename unpacket_traits::type (&a)[N] /* a */); - /*************************************************************************** * Some generic implementations to be used by implementors ***************************************************************************/ diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h index 89ac92062..9af6a9af7 100644 --- a/Eigen/src/Geometry/arch/Geometry_SIMD.h +++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -28,8 +28,9 @@ struct quat_product evaluator ae(_a.coeffs()); evaluator be(_b.coeffs()); Quaternion res; - float arr[4] = {0.f, 0.f, 0.f, -0.f}; - const Packet4f mask = pset(arr); + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {0.f, 0.f, 0.f, neg_zero}; + const Packet4f mask = ploadu(arr); Packet4f a = ae.template packet(0); Packet4f b = be.template packet(0); Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); @@ -55,8 +56,9 @@ struct quat_conj { evaluator qe(q.coeffs()); Quaternion res; - float arr[4] = {-0.f,-0.f,-0.f,0.f}; - const Packet4f mask = pset(arr); + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f}; + const Packet4f mask = ploadu(arr); pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); return res; } @@ -147,10 +149,10 @@ struct quat_conj evaluator qe(q.coeffs()); Quaternion res; const double neg_zero = numext::bit_cast(0x8000000000000000ull); - double arr1[2] = {neg_zero, neg_zero}; - double arr2[2] = {neg_zero, 0.0}; - const Packet2d mask0 = pset(arr1); - const Packet2d mask2 = pset(arr2); + const double arr1[2] = {neg_zero, neg_zero}; + const double arr2[2] = {neg_zero, 0.0}; + const Packet2d mask0 = ploadu(arr1); + const Packet2d mask2 = ploadu(arr2); pstoret(&res.x(), pxor(mask0, 
qe.template packet::Alignment,Packet2d>(0))); pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); return res; diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 106224bbc..a232ffc0a 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -144,7 +144,7 @@ struct compute_inverse_size4(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; - const Packet4f p4f_sign_PNNP = pset(sign_mask); + const Packet4f p4f_sign_PNNP = ploadu(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); iB = pmul(iB, rd); @@ -328,8 +328,8 @@ struct compute_inverse_size4(0x8000000000000000ull)}; const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; - const Packet2d sign_PN = pset(sign_mask1); - const Packet2d sign_NP = pset(sign_mask2); + const Packet2d sign_PN = ploadu(sign_mask1); + const Packet2d sign_NP = ploadu(sign_mask2); d1 = pxor(rd, sign_PN); d2 = pxor(rd, sign_NP); -- GitLab From c2c0f6f64b5c512724b9979d2841b49547f28104 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 18 Jun 2021 13:06:04 -0700 Subject: [PATCH 045/266] Fix fix<> for gcc-4.9.3. There's a missing `EIGEN_HAS_CXX14` -> `EIGEN_HAS_CXX14_VARIABLE_TEMPLATES` replacement. 
Fixes #2267 (cherry picked from commit 35a367d557078462a0793c88c44dcad64fc63698) --- Eigen/src/Core/util/IntegralConstant.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index d457e02ee..945d426ea 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -77,7 +77,7 @@ public: template FixedInt operator&( FixedInt) const { return FixedInt(); } -#if EIGEN_HAS_CXX14 +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // Needed in C++14 to allow fix(): FixedInt operator() () const { return *this; } -- GitLab From a2040ef796153edd1a1d7aea3b0c61252c84b615 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 18 Jun 2021 14:24:11 -0700 Subject: [PATCH 046/266] Rewrite balancer to avoid overflows. The previous balancer overflowed for large row/column norms. Modified to prevent that. Fixes #2273. (cherry picked from commit e9ab4278b7aba6f279c964d99ae5a312d12ab04b) --- unsupported/Eigen/src/Polynomials/Companion.h | 47 ++++++++++--------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h index 6ab8f9714..59a15b098 100644 --- a/unsupported/Eigen/src/Polynomials/Companion.h +++ b/unsupported/Eigen/src/Polynomials/Companion.h @@ -20,12 +20,6 @@ namespace internal { #ifndef EIGEN_PARSED_BY_DOXYGEN -template -T radix(){ return 2; } - -template -T radix2(){ return radix()*radix(); } - template struct decrement_if_fixed_size { @@ -141,7 +135,10 @@ inline bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, bool& isBalanced, RealScalar& colB, RealScalar& rowB ) { - if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; } + if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm + || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){ + return true; + } else { //To find the balancing
coefficients, if the radix is 2, @@ -149,33 +146,41 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$ // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$ // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$ - rowB = rowNorm / radix(); + const RealScalar radix = RealScalar(2); + const RealScalar radix2 = RealScalar(4); + + rowB = rowNorm / radix; colB = RealScalar(1); const RealScalar s = colNorm + rowNorm; - while (colNorm < rowB) + // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm + RealScalar scout = colNorm; + while (scout < rowB) { - colB *= radix(); - colNorm *= radix2(); + colB *= radix; + scout *= radix2; } - - rowB = rowNorm * radix(); - - while (colNorm >= rowB) + + // We now have an upper-bound for sigma, try to lower it. + // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm + scout = colNorm * (colB / radix) * colB; // Avoid overflow. + while (scout >= rowNorm) { - colB /= radix(); - colNorm /= radix2(); + colB /= radix; + scout /= radix2; } - //This line is used to avoid insubstantial balancing - if ((rowNorm + colNorm) < RealScalar(0.95) * s * colB) + // This line is used to avoid insubstantial balancing. 
+ if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB) { isBalanced = false; rowB = RealScalar(1) / colB; return false; } - else{ - return true; } + else + { + return true; + } } } -- GitLab From fd5d23fdf3c4d9ff638d5afdfb577bc4bd9b1eed Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 21 Jun 2021 19:06:25 +0000 Subject: [PATCH 047/266] Update ComplexEigenSolver_eigenvectors.cpp (cherry picked from commit ea62c937edcc2c5efdaccfb6813ca39f48564ece) --- doc/snippets/ComplexEigenSolver_eigenvectors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp index bb1c2ccf1..adeed9af6 100644 --- a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp +++ b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp @@ -1,4 +1,4 @@ MatrixXcf ones = MatrixXcf::Ones(3,3); ComplexEigenSolver ces(ones); cout << "The first eigenvector of the 3x3 matrix of ones is:" - << endl << ces.eigenvectors().col(1) << endl; + << endl << ces.eigenvectors().col(0) << endl; -- GitLab From 4780d8dfb2b0e9bcff063c80f4ffce71d9d7a725 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 21 Jun 2021 19:06:04 +0000 Subject: [PATCH 048/266] Fix typo in SelfAdjointEigenSolver_eigenvectors.cpp (cherry picked from commit c8a2b4d20a162dc2527425f40cf7df95db5ba428) --- doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp index cfc8b0d54..94b0d6ebd 100644 --- a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp +++ b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp @@ -1,4 +1,4 @@ MatrixXd ones = MatrixXd::Ones(3,3); SelfAdjointEigenSolver es(ones); cout << "The first eigenvector of the 3x3 matrix of ones is:" - << endl << es.eigenvectors().col(1) << endl; + << endl << es.eigenvectors().col(0) << endl; -- 
GitLab From a235ddef394602229a938765fc2e5fe12e5b73e1 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 24 Jun 2021 15:47:48 -0700 Subject: [PATCH 049/266] Get rid of code duplication for conj_helper. For packets where LhsType=RhsType a single generic implementation suffices. For scalars, the generic implementation of pconj automatically forwards to numext::conj, so much of the existing specialization can be avoided. For mixed types we still need specializations. (cherry picked from commit 52a5f9821235e5a9f7e9b3e0198d45d42a1cb267) --- Eigen/src/Core/arch/AVX/Complex.h | 66 ------------- Eigen/src/Core/arch/AVX512/Complex.h | 33 ------- Eigen/src/Core/arch/AltiVec/Complex.h | 70 +------------ Eigen/src/Core/arch/Default/ConjHelper.h | 106 +++++++++++++++++--- Eigen/src/Core/arch/MSA/Complex.h | 72 -------------- Eigen/src/Core/arch/NEON/Complex.h | 87 +---------------- Eigen/src/Core/arch/SSE/Complex.h | 119 ++--------------------- Eigen/src/Core/arch/ZVector/Complex.h | 103 +------------------- Eigen/src/Core/util/BlasUtil.h | 84 ---------------- 9 files changed, 105 insertions(+), 635 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 506ca0be5..ab7bd6c65 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -167,39 +167,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const 
Packet4cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) @@ -350,39 +317,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 45f22f436..49c72b3f1 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -153,39 +153,6 @@ EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const 
Packet8cf& a) return Packet4cf(res); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index c6cb59e8f..e1711930b 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -206,45 +206,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return 
internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } @@ -404,45 +371,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res 
= conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); } diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h index 4cfe34e05..99783b4ec 100644 --- a/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -11,19 +11,97 @@ #ifndef EIGEN_ARCH_CONJ_HELPER_H #define EIGEN_ARCH_CONJ_HELPER_H -#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ - }; \ - \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \ + const PACKET_CPLX& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \ + const PACKET_CPLX& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); \ + } \ + }; \ + \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \ + const PACKET_REAL& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \ + const PACKET_REAL& y) const { \ + return 
PACKET_CPLX(Eigen::internal::pmul(x.v, y)); \ + } \ }; -#endif // EIGEN_ARCH_CONJ_HELPER_H +namespace Eigen { +namespace internal { + +template struct conj_if; + +template<> struct conj_if { + template + inline T operator()(const T& x) const { return numext::conj(x); } + template + inline T pconj(const T& x) const { return internal::pconj(x); } +}; + +template<> struct conj_if { + template + inline const T& operator()(const T& x) const { return x; } + template + inline const T& pconj(const T& x) const { return x; } +}; + +// Generic implementation. +template +struct conj_helper +{ + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } +}; + +template +struct conj_helper +{ + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const + { return pconj(Eigen::internal::pmul(x, y)); } +}; + +// Generic implementation for mixed products of complex scalar types. 
+template struct conj_helper, RealScalar, Conj,false> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const + { return c + conj_if().pconj(x) * y; } + EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const + { return conj_if().pconj(x) * y; } +}; + +template struct conj_helper, false,Conj> +{ + typedef std::complex Scalar; + EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const + { return c + pmul(x,y); } + EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const + { return x * conj_if().pconj(y); } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 4877a95a8..53dacfa43 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -305,42 +305,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return pconj(internal::pmul(a, 
b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) template <> @@ -644,42 +608,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& return pfirst(a); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) template <> diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index a889ab1d2..f40af7f87 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -342,67 +342,13 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return s; } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf 
pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) { // TODO optimize it for NEON - Packet1cf res = conj_helper().pmul(a,b); + Packet1cf res = pmul(a, pconj(b)); Packet2f s, rev_s; // this computes the norm @@ -414,7 +360,7 @@ template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a,pconj(b)); Packet4f s, rev_s; // this computes the norm @@ -603,39 
+549,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return pconj(internal::pmul(a,b)); } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); Packet2d rev_s = preverse(s); diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 1cab374c0..8fe22da46 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -165,74 +165,21 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef 
EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = 
pmul(a, pconj(b)); __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) -{ - return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); -} //---------- double ---------- @@ -348,66 +295,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - 
return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index ddf5a97d8..0b9b33d99 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -165,45 +165,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE 
Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } @@ -337,39 +304,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return res; } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) @@ -456,45 +390,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { 
return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index c5161022c..44206326c 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -39,90 +39,6 @@ template struct general_matrix_vector_product; - -template struct conj_if; - -template<> struct conj_if { - template - inline T operator()(const T& x) const { return numext::conj(x); } - template - inline T pconj(const T& x) const { return internal::pconj(x); } -}; - -template<> struct conj_if { - template - inline const T& operator()(const T& x) const { return x; } - template - inline const T& pconj(const T& x) const { return x; } -}; - -// Generic implementation for custom complex types. 
-template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType Scalar; - - EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const - { return conj_if()(x) * conj_if()(y); } -}; - -template struct conj_helper -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } -}; - -template struct conj_helper, std::complex, false,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } -}; - -template struct conj_helper, std::complex, true,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, std::complex, true,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } 
-}; - -template struct conj_helper, RealScalar, Conj,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if()(x)*y; } -}; - -template struct conj_helper, false,Conj> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x*conj_if()(y); } -}; - template struct get_factor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } }; -- GitLab From 413ff2b53166c8abc3a5330fe4c4cf41629b7bf1 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 24 Jun 2021 18:52:17 -0700 Subject: [PATCH 050/266] Small cleanup: Get rid of the macros EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD and CJMADD, which were effectively unused, apart from on x86, where the change results in identically performing code. 
(cherry picked from commit bffd267d176410a517a0fe9afa6dde99c213c08a) --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 4 -- Eigen/src/Core/arch/MSA/PacketMath.h | 4 -- Eigen/src/Core/arch/NEON/PacketMath.h | 4 -- Eigen/src/Core/arch/SVE/PacketMath.h | 4 -- Eigen/src/Core/arch/ZVector/PacketMath.h | 4 -- .../Core/products/GeneralBlockPanelKernel.h | 46 +++---------------- 6 files changed, 7 insertions(+), 59 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index a3ebf9ec2..8c42f495c 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 diff --git a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index f03cf61ff..afe8f3375 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -28,10 +28,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e1efe9bcb..5e7702a50 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,10 +24,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #if EIGEN_ARCH_ARM64 #define 
EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h index 4877b6d80..9060b372f 100644 --- a/Eigen/src/Core/arch/SVE/PacketMath.h +++ b/Eigen/src/Core/arch/SVE/PacketMath.h @@ -22,10 +22,6 @@ namespace internal #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 template diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 2246439cc..1f55a90a5 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 8362ecc02..1116321a9 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -349,36 +349,6 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - - // FIXME (a bit overkill maybe ?) 
- - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; - - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; - - template - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector::run(cj,a,b,c,t); - } - - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif - template struct RhsPanelHelper { private: @@ -2060,14 +2030,14 @@ void gebp_kernel Date: Fri, 25 Jun 2021 19:28:00 +0000 Subject: [PATCH 051/266] Commit 52a5f982 broke conjhelper functionality for HIP GPUs. This commit addresses this. (cherry picked from commit 2d132d17365ffc84c0cc7a7da9b8f7090e94b476) --- Eigen/src/Core/arch/Default/ConjHelper.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h index 99783b4ec..255daddc5 100644 --- a/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -45,16 +45,16 @@ template struct conj_if; template<> struct conj_if { template - inline T operator()(const T& x) const { return numext::conj(x); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); } template - inline T pconj(const T& x) const { return internal::pconj(x); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); } }; template<> struct conj_if { template - inline const T& operator()(const T& x) const { return x; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; } template - inline const T& pconj(const T& x) const { return x; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) 
const { return x; } }; // Generic implementation. @@ -63,10 +63,10 @@ struct conj_helper { typedef typename ScalarBinaryOpTraits::ReturnType ResultType; - EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } - EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } }; @@ -75,10 +75,10 @@ struct conj_helper { typedef typename ScalarBinaryOpTraits::ReturnType ResultType; - EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). 
- EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const { return pconj(Eigen::internal::pmul(x, y)); } }; @@ -86,18 +86,18 @@ struct conj_helper template struct conj_helper, RealScalar, Conj,false> { typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const { return c + conj_if().pconj(x) * y; } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const { return conj_if().pconj(x) * y; } }; template struct conj_helper, false,Conj> { typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const { return c + pmul(x,y); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const { return x * conj_if().pconj(y); } }; -- GitLab From 380d0e49161f840fe4c34caaddd8484feb949129 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 29 Jun 2021 23:26:15 +0000 Subject: [PATCH 052/266] Get rid of redundant `pabs` instruction in complex square root. 
(cherry picked from commit 5aebbe9098f53f01c99eed67b52725397e955280) --- Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index bb4f719a8..c9fbaf68b 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -802,9 +802,8 @@ Packet psqrt_complex(const Packet& a) { // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. - Packet a_flip = pcplxflip(a); RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|] - RealPacket a_abs_flip = pabs(a_flip.v); // [|y0|, |x0|, |y1|, |x1|] + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|] RealPacket a_max = pmax(a_abs, a_abs_flip); RealPacket a_min = pmin(a_abs, a_abs_flip); RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); -- GitLab From d82d9150477e8fdb1b13635de9139363dc3cd9bb Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 25 Jun 2021 14:22:19 -0700 Subject: [PATCH 053/266] Modify tensor argmin/argmax to always return first occurence. As written, depending on multithreading/gpu, the returned index from `argmin`/`argmax` is not currently stable. Here we modify the functors to always keep the first occurence (i.e. if the value is equal to the current min/max, then keep the one with the smallest index). This is otherwise causing unpredictable results in some TF tests. 
(cherry picked from commit 3a087ccb99b454dc34484333e608e836e7032213) --- .../Eigen/CXX11/src/Tensor/TensorFunctors.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index fd8fa00fa..3b2100ab0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -365,12 +365,16 @@ struct reducer_traits { }; }; - -// Argmin/Argmax reducers +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. template struct ArgMaxTupleReducer { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || t.first < accum->first) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::lowest()); @@ -394,7 +398,11 @@ struct reducer_traits, Device> { template struct ArgMinTupleReducer { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || t.first < accum->first) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::highest()); -- GitLab From 94e2250b36f22e821507b4e6e47ce36236bf65da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A7=E6=B2=B3=E3=83=A1=E3=82=BF=E3=83=AB?= Date: Wed, 30 Jun 2021 04:09:46 +0000 Subject: [PATCH 054/266] Correct declarations for aarch64-pc-windows-msvc (cherry picked from commit c81da59a252b3479753b2eada26ee0cf46280bd0) --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h 
b/Eigen/src/Core/arch/NEON/PacketMath.h index 5e7702a50..d2aeef430 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -32,7 +32,7 @@ namespace internal { #endif #endif -#if EIGEN_COMP_MSVC +#if EIGEN_COMP_MSVC_STRICT // In MSVC's arm_neon.h header file, all NEON vector types // are aliases to the same underlying type __n128. @@ -78,7 +78,7 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; -#endif // EIGEN_COMP_MSVC +#endif // EIGEN_COMP_MSVC_STRICT EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); -- GitLab From 517294d6e1153dc57998337f69d2c37931f33fab Mon Sep 17 00:00:00 2001 From: Alexander Karatarakis Date: Wed, 30 Jun 2021 04:27:51 +0000 Subject: [PATCH 055/266] Make DenseStorage<> trivially_copyable (cherry picked from commit 60400334a92268272c6bf525da89eec5e99c3e5a) --- Eigen/src/Core/DenseStorage.h | 13 +++++++++++++ test/dense_storage.cpp | 11 +++++++++++ 2 files changed, 24 insertions(+) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 9acca6c90..8a4cbd4be 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -214,17 +214,26 @@ template class DenseSt EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} +#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } +#else + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) = default; +#endif +#if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { if (this != &other) m_data = other.m_data; return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) = default; +#endif #if 
EIGEN_HAS_RVALUE_REFERENCES +#if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT : m_data(std::move(other.m_data)) { @@ -235,6 +244,10 @@ template class DenseSt m_data = std::move(other.m_data); return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT = default; +#endif #endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index 36ccbb02c..45c2bd728 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp @@ -13,6 +13,17 @@ #include +#if EIGEN_HAS_TYPE_TRAITS && EIGEN_HAS_CXX11 +using DenseStorageD3x3 = Eigen::DenseStorage; +static_assert(std::is_trivially_move_constructible::value, "DenseStorage not trivially_move_constructible"); +static_assert(std::is_trivially_move_assignable::value, "DenseStorage not trivially_move_assignable"); +#if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) +static_assert(std::is_trivially_copy_constructible::value, "DenseStorage not trivially_copy_constructible"); +static_assert(std::is_trivially_copy_assignable::value, "DenseStorage not trivially_copy_assignable"); +static_assert(std::is_trivially_copyable::value, "DenseStorage not trivially_copyable"); +#endif +#endif + template void dense_storage_copy(int rows, int cols) { -- GitLab From 1f6b1c1a1fba66dd2e09b674bbeeaf8bbd4f11f6 Mon Sep 17 00:00:00 2001 From: Dan Miller Date: Thu, 1 Jul 2021 14:54:12 +0000 Subject: [PATCH 056/266] Fix duplicate definitions on Mac (cherry picked from commit eb047759030558acf0764d5d2f913f4f84cf85a8) --- Eigen/src/Core/util/Meta.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index b6aaed138..2429ddad2 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h 
@@ -189,19 +189,17 @@ template<> struct make_unsigned { typedef unsigned int type; } template<> struct make_unsigned { typedef unsigned int type; }; template<> struct make_unsigned { typedef unsigned long type; }; template<> struct make_unsigned { typedef unsigned long type; }; -template<> struct make_unsigned { typedef unsigned long long type; }; -template<> struct make_unsigned { typedef unsigned long long type; }; #if EIGEN_COMP_MSVC template<> struct make_unsigned { typedef unsigned __int64 type; }; template<> struct make_unsigned { typedef unsigned __int64 type; }; #endif -// Some platforms define int64_t as long long even for C++03. In this case we -// are missing the definition for make_unsigned. If we just define it, we get -// duplicated definitions for platforms defining int64_t as signed long for -// C++03. We therefore add the specialization for C++03 long long for these -// platforms only. -#if EIGEN_OS_MAC +// Some platforms define int64_t as `long long` even for C++03, where +// `long long` is not guaranteed by the standard. In this case we are missing +// the definition for make_unsigned. If we just define it, we run into issues +// where `long long` doesn't exist in some compilers for C++03. We therefore add +// the specialization for these platforms only. +#if EIGEN_OS_MAC || EIGEN_COMP_MINGW template<> struct make_unsigned { typedef unsigned long long type; }; template<> struct make_unsigned { typedef unsigned long long type; }; #endif -- GitLab From b6db0134351e71b0302112f3ac5bacc988a3a64b Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 1 Jul 2021 13:41:04 -0700 Subject: [PATCH 057/266] Fix inverse nullptr/asan errors for LU. For empty or single-column matrices, the current `PartialPivLU` currently dereferences a `nullptr` or accesses memory out-of-bounds. Here we adjust the checks to avoid this. 
(cherry picked from commit 154f00e9eacaec5667215784c7601b55024e2f61) --- Eigen/src/LU/PartialPivLU.h | 7 ++++++- test/inverse.cpp | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 46ffdd320..34aed7249 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -504,8 +504,13 @@ struct partial_lu_impl template void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix. + if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, diff --git a/test/inverse.cpp b/test/inverse.cpp index 99f9e0c9b..9cedfa1e1 100644 --- a/test/inverse.cpp +++ b/test/inverse.cpp @@ -135,6 +135,8 @@ EIGEN_DECLARE_TEST(inverse) CALL_SUBTEST_5( inverse(MatrixXf(s,s)) ); TEST_SET_BUT_UNUSED_VARIABLE(s) CALL_SUBTEST_5( inverse_zerosized() ); + CALL_SUBTEST_5( inverse(MatrixXf(0, 0)) ); + CALL_SUBTEST_5( inverse(MatrixXf(1, 1)) ); s = internal::random(25,100); CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) ); -- GitLab From 8190739f12583666c137a83ef52f3504ea3059bb Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 1 Jul 2021 12:47:52 -0700 Subject: [PATCH 058/266] Fix compile issues for gcc 4.8. - Move constructors can only be defaulted as NOEXCEPT if all members have NOEXCEPT move constructors. - gcc 4.8 has some funny parsing bug in `a < b->c`, thinking `b-` is a template parameter. 
(cherry picked from commit 6035da5283f12f7e6a49cda0c21696c8e5a115b7) --- Eigen/src/Core/DenseStorage.h | 8 ++++---- unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index 8a4cbd4be..08ef6c530 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -220,7 +220,7 @@ template class DenseSt EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } #else - EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) = default; + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; #endif #if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC @@ -230,7 +230,7 @@ template class DenseSt return *this; } #else - EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; #endif #if EIGEN_HAS_RVALUE_REFERENCES #if !EIGEN_HAS_CXX11 @@ -245,8 +245,8 @@ template class DenseSt return *this; } #else - EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT = default; - EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&& other) EIGEN_NOEXCEPT = default; + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = default; #endif #endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 3b2100ab0..d96303224 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -372,7 +372,7 @@ template struct ArgMaxTupleReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { if (t.second < accum->second) { return; - } else if (t.second > accum->second || t.first < accum->first) { + } else if (t.second > accum->second || accum->first 
> t.first ) { *accum = t; } } @@ -400,7 +400,7 @@ template struct ArgMinTupleReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { if (t.second > accum->second) { return; - } else if (t.second < accum->second || t.first < accum->first) { + } else if (t.second < accum->second || accum->first > t.first) { *accum = t; } } -- GitLab From eebde572d98bb349d1df29d1e5308ee9d413ab32 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 30 Jun 2021 23:05:04 +0000 Subject: [PATCH 059/266] Create the ability to disable the specialized gemm_pack_rhs in Eigen (only PPC) for TensorFlow (cherry picked from commit 91e99ec1e02100d07e35a7abb1b5c76707237219) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 4c5cf1762..454b36cd6 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -11,6 +11,10 @@ #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK +#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1 +#endif + #include "MatrixProductCommon.h" // Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX @@ -2423,6 +2427,7 @@ void gemm_pack_lhs struct gemm_pack_rhs { @@ -2450,6 +2455,7 @@ void gemm_pack_rhs pack; pack(blockB, rhs, depth, cols, stride, offset); } +#endif template struct gemm_pack_lhs @@ -2478,6 +2484,7 @@ void gemm_pack_lhs pack; pack(blockA, lhs, depth, rows, stride, offset); } + template struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> { @@ -2506,6 +2513,7 @@ void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, pack(blockA, lhs, depth, rows, stride, offset); } +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK template struct gemm_pack_rhs { @@ -2533,6 +2541,7 @@ void gemm_pack_rhs dhs_pack pack; pack(blockB, rhs, 
depth, cols, stride, offset); } +#endif template struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> -- GitLab From 05bab8139a97df5fc39a8005fbdef6ccc48f441d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 2 Jul 2021 13:36:05 -0700 Subject: [PATCH 060/266] Fix breakage of conj_helper in conjunction with custom types introduced in !537. (cherry picked from commit 7b35638ddb99a0298c5d3450de506a8e8e0203d3) --- Eigen/src/Core/arch/Default/ConjHelper.h | 68 ++++++++++++++---------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h index 255daddc5..53830b5a2 100644 --- a/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -57,48 +57,58 @@ template<> struct conj_if { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; } }; -// Generic implementation. +// Generic Implementation, assume scalars since the packet-version is +// specialized below. 
template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType ResultType; +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const - { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return this->pmul(x, y) + c; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const - { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsType& x, const RhsType& y) const + { return conj_if()(x) * conj_if()(y); } }; -template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType ResultType; +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const + { return this->pmul(x, y) + c; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const - { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const - { return pconj(Eigen::internal::pmul(x, y)); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsScalar& x, const RhsScalar& y) const + { return numext::conj(x * y); } }; -// Generic implementation for mixed products of complex scalar types. -template struct conj_helper, RealScalar, Conj,false> +// Implementation with equal type, use packet operations. 
+template +struct conj_helper { - typedef std::complex Scalar; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return c + conj_if().pconj(x) * y; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if().pconj(x) * y; } + typedef Packet ResultType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } }; -template struct conj_helper, false,Conj> +template +struct conj_helper { - typedef std::complex Scalar; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x * conj_if().pconj(y); } + typedef Packet ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return pconj(Eigen::internal::pmul(x, y)); } }; } // namespace internal -- GitLab From 601814b5754fa467a665cfceec4f5cf059a71ddb Mon Sep 17 00:00:00 2001 From: Jonas Harsch Date: Fri, 2 Jul 2021 20:33:52 +0000 Subject: [PATCH 061/266] Don't crash when attempting to shuffle an empty tensor. 
(cherry picked from commit aab747021be5ed1a1e9667243d884eb72003599d) --- .../Eigen/CXX11/src/Tensor/TensorShuffling.h | 6 +- unsupported/test/cxx11_tensor_shuffling.cpp | 55 +++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index 0999815d7..e5e5efdee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -142,7 +142,8 @@ struct TensorEvaluator, Device> m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); } } else { m_unshuffledInputStrides[NumDims - 1] = 1; @@ -151,7 +152,8 @@ struct TensorEvaluator, Device> m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? 
m_outputStrides[i] : Index(1)); } } diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 2ec85d2d4..89a64c021 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -215,6 +215,59 @@ static void test_shuffle_unshuffle() } +template +static void test_empty_shuffling() +{ + Tensor tensor(2,3,0,7); + tensor.setRandom(); + array shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 0); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) { CALL_SUBTEST(test_simple_shuffling()); @@ -225,4 +278,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) CALL_SUBTEST(test_shuffling_as_value()); CALL_SUBTEST(test_shuffle_unshuffle()); CALL_SUBTEST(test_shuffle_unshuffle()); + CALL_SUBTEST(test_empty_shuffling()); + CALL_SUBTEST(test_empty_shuffling()); } -- GitLab From 84955d109f894ce4e80837b2fd97bde3d0410612 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 2 Jul 2021 21:23:15 -0700 Subject: 
[PATCH 062/266] Fix Tensor documentation page. The extra [TOC] tag is generating a huge floating duplicated table-of-contents, which obscures the majority of the page (see bottom of https://eigen.tuxfamily.org/dox/unsupported/eigen_tensors.html). Remove it. Also, headers do not support markup (see [doxygen bug](https://github.com/doxygen/doxygen/issues/7467)), so backticks like ``` ``` end up generating titles that look like ``` Constructor Tensor ``` Removing backticks for now. To generate properly formatted headers, we must directly use html instead of markdown, i.e. ```

Constructor Tensor<double,2>

``` which is ugly. Fixes #2254. (cherry picked from commit f5a9873bbb5488bcba3e37f92b4ec09a8db76081) --- unsupported/Eigen/CXX11/src/Tensor/README.md | 156 +++++++++---------- 1 file changed, 77 insertions(+), 79 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index 9b6f14204..2f65b1b0e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -3,8 +3,6 @@ Tensors are multidimensional arrays of elements. Elements are typically scalars, but more complex types such as strings are also supported. -[TOC] - ## Tensor Classes You can manipulate a tensor with one of the following classes. They all are in @@ -21,7 +19,7 @@ matrix. Tensors of this class are resizable. For example, if you assign a tensor of a different size to a Tensor, that tensor is resized to match its new value. -#### Constructor `Tensor(size0, size1, ...)` +#### Constructor Tensor(size0, size1, ...) Constructor for a Tensor. The constructor must be passed `rank` integers indicating the sizes of the instance along each of the the `rank` @@ -34,7 +32,7 @@ dimensions. // Resize t_3d by assigning a tensor of different sizes, but same rank. t_3d = Tensor(3, 4, 3); -#### Constructor `Tensor(size_array)` +#### Constructor Tensor(size_array) Constructor where the sizes for the constructor are specified as an array of values instead of an explicitly list of parameters. The array type to use is @@ -45,7 +43,7 @@ from an initializer list. Tensor t_2d({5, 7}); -### Class `TensorFixedSize>` +### Class TensorFixedSize> Class to use for tensors of fixed size, where the size is known at compile time. Fixed sized tensors can provide very fast computations because all their @@ -57,7 +55,7 @@ tensor data is held onto the stack and does not cause heap allocation and free. // Create a 4 x 3 tensor of floats. 
TensorFixedSize> t_4x3; -### Class `TensorMap>` +### Class TensorMap> This is the class to use to create a tensor on top of memory allocated and owned by another part of your code. It allows to view any piece of allocated @@ -67,7 +65,7 @@ data are stored. A TensorMap is not resizable because it does not own the memory where its data are stored. -#### Constructor `TensorMap>(data, size0, size1, ...)` +#### Constructor TensorMap>(data, size0, size1, ...) Constructor for a Tensor. The constructor must be passed a pointer to the storage for the data, and "rank" size attributes. The storage has to be @@ -87,13 +85,13 @@ large enough to hold all the data. TensorMap> t_12(t_4x3.data(), 12); -#### Class `TensorRef` +#### Class TensorRef See Assigning to a TensorRef below. ## Accessing Tensor Elements -#### ` tensor(index0, index1...)` +#### tensor(index0, index1...) Return the element at position `(index0, index1...)` in tensor `tensor`. You must pass as many parameters as the rank of `tensor`. @@ -278,7 +276,7 @@ Simiarly, assigning an expression to a TensorMap causes its evaluation. Like tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to have the rank and sizes of the expression that are assigned to them. -#### Calling `eval()`. +#### Calling eval(). When you compute large composite expressions, you sometimes want to tell Eigen that an intermediate value in the expression tree is worth evaluating ahead of @@ -355,7 +353,7 @@ call for the right hand side: (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval(); -#### Assigning to a `TensorRef`. +#### Assigning to a TensorRef. If you need to access only a few elements from the value of an expression you can avoid materializing the value in a full tensor by using a TensorRef. @@ -455,24 +453,24 @@ memory for tensors with cuda. 
In the documentation of the tensor methods and Operation we mention datatypes that are tensor-type specific: -#### `::``Dimensions` +#### ::Dimensions Acts like an array of ints. Has an `int size` attribute, and can be indexed like an array to access individual values. Used to represent the dimensions of a tensor. See `dimensions()`. -#### `::``Index` +#### ::Index Acts like an `int`. Used for indexing tensors along their dimensions. See `operator()`, `dimension()`, and `size()`. -#### `::``Scalar` +#### ::Scalar Represents the datatype of individual tensor elements. For example, for a `Tensor`, `Scalar` is the type `float`. See `setConstant()`. -#### `` +#### We use this pseudo type to indicate that a tensor Operation is returned by a method. We indicate in the text the type and dimensions of the tensor that the @@ -492,7 +490,7 @@ Tensor, TensorFixedSize, and TensorMap. ## Metadata -### `int NumDimensions` +### int NumDimensions Constant value indicating the number of dimensions of a Tensor. This is also known as the tensor "rank". @@ -501,7 +499,7 @@ known as the tensor "rank". cout << "Dims " << a.NumDimensions; => Dims 2 -### `Dimensions dimensions()` +### Dimensions dimensions() Returns an array-like object representing the dimensions of the tensor. The actual type of the `dimensions()` result is `::``Dimensions`. @@ -519,7 +517,7 @@ If you use a C++11 compiler, you can use `auto` to simplify the code: << ", dim 1: " << d[1]; => Dim size: 2, dim 0: 3, dim 1: 4 -### `Index dimension(Index n)` +### Index dimension(Index n) Returns the n-th dimension of the tensor. The actual type of the `dimension()` result is `::``Index`, but you can @@ -530,7 +528,7 @@ always use it like an int. cout << "Dim 1: " << dim1; => Dim 1: 4 -### `Index size()` +### Index size() Returns the total number of elements in the tensor. This is the product of all the tensor dimensions. 
The actual type of the `size()` result is @@ -605,7 +603,7 @@ You can use one of the methods below to initialize the tensor memory. These have an immediate effect on the tensor and return the tensor itself as a result. These are not tensor Operations which delay evaluation. -### ` setConstant(const Scalar& val)` +### setConstant(const Scalar& val) Sets all elements of the tensor to the constant value `val`. `Scalar` is the type of data stored in the tensor. You can pass any value that is @@ -633,7 +631,7 @@ has a copy constructor and an `operator=()`: yolo yolo yolo -### ` setZero()` +### setZero() Fills the tensor with zeros. Equivalent to `setConstant(Scalar(0))`. Returns the tensor itself in case you want to chain another call. @@ -647,7 +645,7 @@ Returns the tensor itself in case you want to chain another call. 0 0 0 0 -### ` setValues({..initializer_list})` +### setValues({..initializer_list}) Fills the tensor with explicit values specified in a std::initializer_list. The type of the initializer list depends on the type and rank of the tensor. @@ -683,7 +681,7 @@ code only sets the values of the first row of the tensor. 10 20 30 1000 1000 1000 -### ` setRandom()` +### setRandom() Fills the tensor with random values. Returns the tensor itself in case you want to chain another call. @@ -750,7 +748,7 @@ values of a tensor expression, the expression must either be evaluated or wrapped in a TensorRef. -### `Scalar* data()` and `const Scalar* data() const` +### Scalar* data() and const Scalar* data() const Returns a pointer to the storage for the tensor. The pointer is const if the tensor was const. This allows direct access to the data. The layout of the @@ -778,7 +776,7 @@ The chain of Operation is evaluated lazily, typically when it is assigned to a tensor. See "Controlling when Expression are Evaluated" for more details about their evaluation. 
-### ` constant(const Scalar& val)` +### constant(const Scalar& val) Returns a tensor of the same type and dimensions as the original tensor but where all elements have the value `val`. @@ -806,7 +804,7 @@ tensor, or multiply every element of a tensor by a scalar. 0.6 0.6 0.6 0.6 0.6 0.6 -### ` random()` +### random() Returns a tensor of the same type and dimensions as the current tensor but where all elements have random values. @@ -836,7 +834,7 @@ All these operations take a single input tensor as argument and return a tensor of the same type and dimensions as the tensor to which they are applied. The requested operations are applied to each element independently. -### ` operator-()` +### operator-() Returns a tensor of the same type and dimensions as the original tensor containing the opposite values of the original tensor. @@ -855,42 +853,42 @@ containing the opposite values of the original tensor. -1 -1 -1 -1 -1 -1 -### ` sqrt()` +### sqrt() Returns a tensor of the same type and dimensions as the original tensor containing the square roots of the original tensor. -### ` rsqrt()` +### rsqrt() Returns a tensor of the same type and dimensions as the original tensor containing the inverse square roots of the original tensor. -### ` square()` +### square() Returns a tensor of the same type and dimensions as the original tensor containing the squares of the original tensor values. -### ` inverse()` +### inverse() Returns a tensor of the same type and dimensions as the original tensor containing the inverse of the original tensor values. -### ` exp()` +### exp() Returns a tensor of the same type and dimensions as the original tensor containing the exponential of the original tensor. -### ` log()` +### log() Returns a tensor of the same type and dimensions as the original tensor containing the natural logarithms of the original tensor. 
-### ` abs()` +### abs() Returns a tensor of the same type and dimensions as the original tensor containing the absolute values of the original tensor. -### ` pow(Scalar exponent)` +### pow(Scalar exponent) Returns a tensor of the same type and dimensions as the original tensor containing the coefficients of the original tensor to the power of the @@ -917,17 +915,17 @@ cubic roots of an int Tensor: 0 1 2 3 4 5 -### ` operator * (Scalar scale)` +### operator * (Scalar scale) Multiplies all the coefficients of the input tensor by the provided scale. -### ` cwiseMax(Scalar threshold)` +### cwiseMax(Scalar threshold) TODO -### ` cwiseMin(Scalar threshold)` +### cwiseMin(Scalar threshold) TODO -### ` unaryExpr(const CustomUnaryOp& func)` +### unaryExpr(const CustomUnaryOp& func) TODO @@ -939,39 +937,39 @@ dimensions as the tensors to which they are applied, and unless otherwise specified it is also of the same type. The requested operations are applied to each pair of elements independently. -### ` operator+(const OtherDerived& other)` +### operator+(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise sums of the inputs. -### ` operator-(const OtherDerived& other)` +### operator-(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise differences of the inputs. -### ` operator*(const OtherDerived& other)` +### operator*(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise products of the inputs. -### ` operator/(const OtherDerived& other)` +### operator/(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise quotients of the inputs. This operator is not supported for integer types. 
-### ` cwiseMax(const OtherDerived& other)` +### cwiseMax(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise maximums of the inputs. -### ` cwiseMin(const OtherDerived& other)` +### cwiseMin(const OtherDerived& other) Returns a tensor of the same type and dimensions as the input tensors containing the coefficient wise mimimums of the inputs. -### ` Logical operators` +### Logical operators The following logical operators are supported as well: @@ -1129,50 +1127,50 @@ scalar, represented as a zero-dimension tensor. 276 -### ` sum(const Dimensions& new_dims)` -### ` sum()` +### sum(const Dimensions& new_dims) +### sum() Reduce a tensor using the sum() operator. The resulting values are the sum of the reduced values. -### ` mean(const Dimensions& new_dims)` -### ` mean()` +### mean(const Dimensions& new_dims) +### mean() Reduce a tensor using the mean() operator. The resulting values are the mean of the reduced values. -### ` maximum(const Dimensions& new_dims)` -### ` maximum()` +### maximum(const Dimensions& new_dims) +### maximum() Reduce a tensor using the maximum() operator. The resulting values are the largest of the reduced values. -### ` minimum(const Dimensions& new_dims)` -### ` minimum()` +### minimum(const Dimensions& new_dims) +### minimum() Reduce a tensor using the minimum() operator. The resulting values are the smallest of the reduced values. -### ` prod(const Dimensions& new_dims)` -### ` prod()` +### prod(const Dimensions& new_dims) +### prod() Reduce a tensor using the prod() operator. The resulting values are the product of the reduced values. -### ` all(const Dimensions& new_dims)` -### ` all()` +### all(const Dimensions& new_dims) +### all() Reduce a tensor using the all() operator. Casts tensor to bool and then checks whether all elements are true. Runs through all elements rather than short-circuiting, so may be significantly inefficient. 
-### ` any(const Dimensions& new_dims)` -### ` any()` +### any(const Dimensions& new_dims) +### any() Reduce a tensor using the any() operator. Casts tensor to bool and then checks whether any element is true. Runs through all elements rather than short-circuiting, so may be significantly inefficient. -### ` reduce(const Dimensions& new_dims, const Reducer& reducer)` +### reduce(const Dimensions& new_dims, const Reducer& reducer) Reduce a tensor using a user-defined reduction operator. See `SumReducer` in TensorFunctors.h for information on how to implement a reduction operator. @@ -1208,8 +1206,8 @@ Example: Trace along 2 dimensions. 15 -### ` trace(const Dimensions& new_dims)` -### ` trace()` +### trace(const Dimensions& new_dims) +### trace() As a special case, if no parameter is passed to the operation, trace is computed along *all* dimensions of the input tensor. @@ -1259,18 +1257,18 @@ dd a comment to this line 1 3 6 4 9 15 -### ` cumsum(const Index& axis)` +### cumsum(const Index& axis) Perform a scan by summing consecutive entries. -### ` cumprod(const Index& axis)` +### cumprod(const Index& axis) Perform a scan by multiplying consecutive entries. ## Convolutions -### ` convolve(const Kernel& kernel, const Dimensions& dims)` +### convolve(const Kernel& kernel, const Dimensions& dims) Returns a tensor that is the output of the convolution of the input tensor with the kernel, along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor @@ -1313,7 +1311,7 @@ These operations return a Tensor with different dimensions than the original Tensor. They can be used to access slices of tensors, see them with different dimensions, or pad tensors with additional data. -### ` reshape(const Dimensions& new_dims)` +### reshape(const Dimensions& new_dims) Returns a view of the input tensor that has been reshaped to the specified new dimensions. The argument new_dims is an array of Index values. 
The @@ -1392,7 +1390,7 @@ Note that "b" itself was not reshaped but that instead the assignment is done to the reshape view of b. -### ` shuffle(const Shuffle& shuffle)` +### shuffle(const Shuffle& shuffle) Returns a copy of the input tensor whose dimensions have been reordered according to the specified permutation. The argument shuffle @@ -1433,7 +1431,7 @@ Let's rewrite the previous example to take advantage of this feature: output.shuffle({2, 0, 1}) = input; -### ` stride(const Strides& strides)` +### stride(const Strides& strides) Returns a view of the input tensor that strides (skips stride-1 elements) along each of the dimensions. The argument strides is an @@ -1459,7 +1457,7 @@ It is possible to assign a tensor to a stride: output.stride({2, 3, 4}) = input; -### ` slice(const StartIndices& offsets, const Sizes& extents)` +### slice(const StartIndices& offsets, const Sizes& extents) Returns a sub-tensor of the given tensor. For each dimension i, the slice is made of the coefficients stored between offset[i] and offset[i] + extents[i] in @@ -1485,7 +1483,7 @@ the input tensor. 600 700 -### ` chip(const Index offset, const Index dim)` +### chip(const Index offset, const Index dim) A chip is a special kind of slice. It is the subtensor at the given offset in the dimension dim. The returned tensor has one fewer dimension than the input @@ -1536,7 +1534,7 @@ lvalue. For example: 0 0 0 -### ` reverse(const ReverseDimensions& reverse)` +### reverse(const ReverseDimensions& reverse) Returns a view of the input tensor that reverses the order of the coefficients along a subset of the dimensions. The argument reverse is an array of boolean @@ -1566,7 +1564,7 @@ of a 2D tensor: 0 100 200 -### ` broadcast(const Broadcast& broadcast)` +### broadcast(const Broadcast& broadcast) Returns a view of the input tensor in which the input is replicated one to many times. @@ -1590,11 +1588,11 @@ made in each of the dimensions. 
0 100 200 0 100 200 300 400 500 300 400 500 -### ` concatenate(const OtherDerived& other, Axis axis)` +### concatenate(const OtherDerived& other, Axis axis) TODO -### ` pad(const PaddingDimensions& padding)` +### pad(const PaddingDimensions& padding) Returns a view of the input tensor in which the input is padded with zeros. @@ -1619,7 +1617,7 @@ Returns a view of the input tensor in which the input is padded with zeros. 0 0 0 0 -### ` extract_patches(const PatchDims& patch_dims)` +### extract_patches(const PatchDims& patch_dims) Returns a tensor of coefficient patches extracted from the input tensor, where each patch is of dimension specified by 'patch_dims'. The returned tensor has @@ -1706,7 +1704,7 @@ This code results in the following output when the data layout is RowMajor: 6 7 10 11 -### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) Returns a tensor of coefficient image patches extracted from the input tensor, which is expected to have dimensions ordered as follows (depending on the data @@ -1763,7 +1761,7 @@ sizes: ## Special Operations -### ` cast()` +### cast() Returns a tensor of type T with the same dimensions as the original tensor. The returned tensor contains the values of the original tensor converted to @@ -1792,7 +1790,7 @@ but you can easily cast the tensors to floats to do the division: 1 2 2 -### ` eval()` +### eval() TODO -- GitLab From 7571704a43f53ac7b159e8a9beeacc57a2015ee6 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 2 Jul 2021 13:03:30 -0700 Subject: [PATCH 063/266] Fix CMake directory issues. Allows absolute and relative paths for - `INCLUDE_INSTALL_DIR` - `CMAKEPACKAGE_INSTALL_DIR` - `PKGCONFIG_INSTALL_DIR` Type should be `PATH` not `STRING`. 
Contrary to !211, these don't seem to be made absolute if user-defined - according to the doc any directories should use `PATH` type, which allows a file dialog to be used via the GUI. It also better handles file separators. If the user provides an absolute path, it will be made relative to `CMAKE_INSTALL_PREFIX` so that the `configure_package_config_file` will work. Fixes #2155 and #2269. (cherry picked from commit f44f05532decf830fcdb07e2a67a2fa4ccbc3870) --- CMakeLists.txt | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ba7d3872..bd1af32b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -424,25 +424,26 @@ endif() if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR) set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed") + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed") else() set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}/eigen3" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed" ) endif() set(CMAKEPACKAGE_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/eigen3/cmake" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed" ) set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed" ) foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR) + # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}". 
if(IS_ABSOLUTE "${${var}}") - message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}") + file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}") endif() endforeach() -- GitLab From 69ec4907daef477a4779eee5722ad8b554f486f2 Mon Sep 17 00:00:00 2001 From: Guoqiang QI Date: Thu, 8 Jul 2021 17:05:26 +0000 Subject: [PATCH 064/266] Make a copy of the input matrix when trying to do the inverse in place; this fixes #2285. (cherry picked from commit 4bcd42c271761dc5341f8e08ca7d357c3614cb01) --- Eigen/src/LU/InverseImpl.h | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index 1bab00c01..27e263945 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -77,10 +77,11 @@ inline void compute_inverse_size2_helper( const MatrixType& matrix, const typename ResultType::Scalar& invdet, ResultType& result) { + typename ResultType::Scalar temp = matrix.coeff(0,0); result.coeffRef(0,0) = matrix.coeff(1,1) * invdet; result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet; result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet; - result.coeffRef(1,1) = matrix.coeff(0,0) * invdet; + result.coeffRef(1,1) = temp * invdet; } template @@ -165,7 +166,12 @@ struct compute_inverse cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); const Scalar det = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); const Scalar invdet = Scalar(1) / det; - compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result); + if(extract_data(matrix) != extract_data(result)) { + compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result); + } else { + MatrixType matrix_t = matrix; + compute_inverse_size3_helper(matrix_t, invdet, cofactors_col0, result); + } } }; @@ -191,7 +197,12 @@ struct compute_inverse_and_det_with_check invertible = abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; - 
compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); + if(extract_data(matrix) != extract_data(inverse)) { + compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); + } else { + MatrixType matrix_t = matrix; + compute_inverse_size3_helper(matrix_t, invdet, cofactors_col0, inverse); + } } }; @@ -273,7 +284,13 @@ struct compute_inverse_and_det_with_check using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse::run(matrix_t, inverse); + } } }; @@ -347,6 +364,8 @@ inline const Inverse MatrixBase::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. @@ -387,6 +406,8 @@ inline void MatrixBase::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of input matrix when trying to do the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. -- GitLab From 5a3c9eddb48cf087c1b6c2a1ac39e9f3623e6cc8 Mon Sep 17 00:00:00 2001 From: Jonas Harsch Date: Thu, 8 Jul 2021 18:02:58 +0000 Subject: [PATCH 065/266] Removed superfluous boolean `degenerate` in TensorMorphing.h. 
(cherry picked from commit e9c9a3130b7307a240335aa527a6d4c5fb2ee471) --- .../Eigen/CXX11/src/Tensor/TensorMorphing.h | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index ea97cf185..b3f00f77a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -446,12 +446,7 @@ struct TensorEvaluator, Devi EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (Index i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - m_is_identity = true; - bool degenerate = false; for (int i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); @@ -459,9 +454,6 @@ struct TensorEvaluator, Devi op.startIndices()[i] != 0) { m_is_identity = false; } - if (op.sizes()[i] == 0) { // we have an empty size - degenerate = true; - } } // No strides for scalars. @@ -479,8 +471,8 @@ struct TensorEvaluator, Devi m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); } + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); + } } else { m_inputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { @@ -491,8 +483,8 @@ struct TensorEvaluator, Devi m_outputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); } + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } } } @@ -933,14 +925,12 @@ struct TensorEvaluator::Dimensions InputDimensions; const InputDimensions& input_dims = m_impl.dimensions(); - // check for degenerate intervals and compute output tensor shape - bool degenerate = false; + // compute output tensor shape m_is_identity = true; for (int i = 0; i < NumDims; i++) { Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { m_dimensions[i] = 0; - degenerate = true; } else { m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); @@ -967,8 +957,7 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = m_strides[NumDims-1]; @@ -983,8 +972,7 @@ struct TensorEvaluator= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? 
m_outputStrides[i] : 1); } } } -- GitLab From 56966fd2e616a67e27724fa614d90f10286f32aa Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 9 Jul 2021 03:59:35 +0000 Subject: [PATCH 066/266] Defer to std::fill_n when filling a dense object with a constant value. (cherry picked from commit 0c361c4899c9042d2b25cd60d7826ab464caacb7) --- Eigen/src/Core/AssignEvaluator.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index f8c87d0fc..7d76f0c25 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -785,6 +785,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType dense_assignment_loop::run(kernel); } +// Specialization for filling the destination with a constant value. +#ifndef EIGEN_GPU_COMPILE_PHASE +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp, DstXprType>& src, const internal::assign_op& func) +{ + resize_if_allowed(dst, src, func); + std::fill_n(dst.data(), dst.size(), src.functor()()); +} +#endif + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) { -- GitLab From 930696fc53599748d095bf81987eb36219599414 Mon Sep 17 00:00:00 2001 From: Rohit Santhanam Date: Fri, 9 Jul 2021 14:58:07 +0000 Subject: [PATCH 067/266] Enable extract et. al. for HIP GPU. 
(cherry picked from commit beea14a18f76817439b4d8901d29db2e9c4a24c8) --- Eigen/src/Core/util/BlasUtil.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index 44206326c..e16a56498 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -518,7 +518,7 @@ struct blas_traits template::HasUsableDirectAccess> struct extract_data_selector { - static const typename T::Scalar* run(const T& m) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) { return blas_traits::extract(m).data(); } @@ -529,7 +529,8 @@ struct extract_data_selector { static typename T::Scalar* run(const T&) { return 0; } }; -template const typename T::Scalar* extract_data(const T& m) +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) { return extract_data_selector::run(m); } -- GitLab From 5d37114fc0741f11781751bea72bec9511da344c Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 20 Jul 2021 11:40:17 -0700 Subject: [PATCH 068/266] Fix explicit default cache size typo. 
(cherry picked from commit 297f0f563d916260665d7fadc017f94f1a5e7a03) --- Eigen/src/Core/products/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 1116321a9..f35b760c1 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -44,7 +44,7 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) -#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_SET_DEFAULT_L3_CACHE_SIZE +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE #else #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) -- GitLab From 9c90d5d832bc04c2e3a0a48f052fae7c6eb136a5 Mon Sep 17 00:00:00 2001 From: arthurfeeney Date: Sat, 17 Jul 2021 10:39:38 -0500 Subject: [PATCH 069/266] Fixes #1387 for compilation error in JacobiSVD with HouseholderQRPreconditioner that occurs when input is a compile-time row vector. (cherry picked from commit a77638387dd1aa2d07d2dae240cc30b303b4ef38) --- Eigen/src/SVD/JacobiSVD.h | 24 ++++++++++++------------ test/jacobisvd.cpp | 3 +++ test/svd_common.h | 15 ++++++++++++--- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 8551a06c6..9d95acdf6 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -112,12 +112,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (int(MatrixType::Options) & ~(int(RowMajor))) - : ColsAtCompileTime==1 ? 
(int(MatrixType::Options) | int(RowMajor)) - : MatrixType::Options + Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -202,13 +202,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (int(MatrixType::Options) & ~(int(RowMajor))) - : ColsAtCompileTime==1 ? (int(MatrixType::Options) | int(RowMajor)) - : MatrixType::Options + Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -303,8 +302,9 @@ public: Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp index 89484d971..5b15c5a27 100644 --- a/test/jacobisvd.cpp +++ b/test/jacobisvd.cpp @@ -36,6 +36,9 @@ void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true) template void jacobisvd_verify_assert(const MatrixType& m) { svd_verify_assert >(m); + svd_verify_assert >(m, true); + svd_verify_assert >(m); + svd_verify_assert >(m); Index rows = m.rows(); Index cols = m.cols(); diff --git a/test/svd_common.h b/test/svd_common.h index bd62edcc8..eae4c0bfe 100644 --- a/test/svd_common.h 
+++ b/test/svd_common.h @@ -462,7 +462,7 @@ void svd_preallocate() } template -void svd_verify_assert(const MatrixType& m) +void svd_verify_assert(const MatrixType& m, bool fullOnly = false) { typedef typename MatrixType::Scalar Scalar; Index rows = m.rows(); @@ -489,8 +489,17 @@ void svd_verify_assert(const MatrixType& m) VERIFY_RAISES_ASSERT(svd.matrixV()) svd.singularValues(); VERIFY_RAISES_ASSERT(svd.solve(rhs)) - - if (ColsAtCompileTime == Dynamic) + + svd.compute(a, ComputeFullU); + svd.matrixU(); + VERIFY_RAISES_ASSERT(svd.matrixV()) + VERIFY_RAISES_ASSERT(svd.solve(rhs)) + svd.compute(a, ComputeFullV); + svd.matrixV(); + VERIFY_RAISES_ASSERT(svd.matrixU()) + VERIFY_RAISES_ASSERT(svd.solve(rhs)) + + if (!fullOnly && ColsAtCompileTime == Dynamic) { svd.compute(a, ComputeThinU); svd.matrixU(); -- GitLab From 5ccb72b2e49435488b57bb171d46afc180ff4bab Mon Sep 17 00:00:00 2001 From: Jonas Harsch Date: Mon, 26 Jul 2021 07:20:19 +0000 Subject: [PATCH 070/266] Fixed typo in TutorialSparse.dox (cherry picked from commit 5b81764c0f4e06ff12a0c769b1bd876b10ad7502) --- doc/TutorialSparse.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index 350ea1139..c69171ec5 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -60,7 +60,7 @@ On the other hand, inserting elements with increasing inner indices in a given i The case where no empty space is available is a special case, and is referred as the \em compressed mode. It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS). Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function. -In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. 
+In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. Therefore, in practice a call to SparseMatrix::makeCompressed() frees this buffer. It is worth noting that most of our wrappers to external libraries requires compressed matrices as inputs. -- GitLab From c334eece4413b1542c40c1c06336f42c2aeb3e76 Mon Sep 17 00:00:00 2001 From: Alexander Karatarakis Date: Thu, 29 Jul 2021 18:02:04 +0000 Subject: [PATCH 071/266] _DerType -> DerivativeType as underscore-followed-by-caps is a reserved identifier (cherry picked from commit f357283d3128a6253af09705155ce4f9f113e3c8) --- .../Eigen/src/AutoDiff/AutoDiffScalar.h | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 0ef159e30..0f166e35f 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -26,11 +26,11 @@ void make_coherent(const A& a, const B&b) make_coherent_impl::run(a.const_cast_derived(), b.const_cast_derived()); } -template struct auto_diff_special_op; +template struct auto_diff_special_op; } // end namespace internal -template class AutoDiffScalar; +template class AutoDiffScalar; template inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { @@ -38,16 +38,16 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: } /** \class AutoDiffScalar - * \brief A scalar type replacement with automatic differentation capability + * \brief A scalar type replacement with automatic differentiation capability * - * \param _DerType the vector type used to store/represent the derivatives. The base scalar type + * \param DerivativeType the vector type used to store/represent the derivatives. 
The base scalar type * as well as the number of derivatives to compute are determined from this type. * Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf * if the number of derivatives is not known at compile time, and/or, the number * of derivatives is large. - * Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a + * Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a * existing vector into an AutoDiffScalar. - * Finally, _DerType can also be any Eigen compatible expression. + * Finally, DerivativeType can also be any Eigen compatible expression. * * This class represents a scalar value while tracking its respective derivatives using Eigen's expression * template mechanism. @@ -63,17 +63,17 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: * */ -template +template class AutoDiffScalar : public internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> Base; - typedef typename internal::remove_all<_DerType>::type DerType; + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> Base; + typedef typename internal::remove_all::type DerType; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real Real; @@ -382,16 +382,16 @@ class AutoDiffScalar namespace internal { -template -struct auto_diff_special_op<_DerType, true> -// : auto_diff_scalar_op<_DerType, typename NumTraits::Real, +template +struct auto_diff_special_op +// : auto_diff_scalar_op::Real, // is_same::Real>::value> { - typedef typename remove_all<_DerType>::type DerType; + typedef typename remove_all::type DerType; typedef typename traits::Scalar Scalar; typedef typename 
NumTraits::Real Real; -// typedef auto_diff_scalar_op<_DerType, typename NumTraits::Real, +// typedef auto_diff_scalar_op::Real, // is_same::Real>::value> Base; // using Base::operator+; @@ -401,8 +401,8 @@ struct auto_diff_special_op<_DerType, true> // using Base::operator*; // using Base::operator*=; - const AutoDiffScalar<_DerType>& derived() const { return *static_cast*>(this); } - AutoDiffScalar<_DerType>& derived() { return *static_cast*>(this); } + const AutoDiffScalar& derived() const { return *static_cast*>(this); } + AutoDiffScalar& derived() { return *static_cast*>(this); } inline const AutoDiffScalar operator+(const Real& other) const @@ -410,12 +410,12 @@ struct auto_diff_special_op<_DerType, true> return AutoDiffScalar(derived().value() + other, derived().derivatives()); } - friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar<_DerType>& b) + friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar& b) { return AutoDiffScalar(a + b.value(), b.derivatives()); } - inline AutoDiffScalar<_DerType>& operator+=(const Real& other) + inline AutoDiffScalar& operator+=(const Real& other) { derived().value() += other; return derived(); @@ -431,22 +431,22 @@ struct auto_diff_special_op<_DerType, true> } friend inline const AutoDiffScalar >, DerType>::Type > - operator*(const Real& other, const AutoDiffScalar<_DerType>& a) + operator*(const Real& other, const AutoDiffScalar& a) { return AutoDiffScalar >, DerType>::Type >( a.value() * other, a.derivatives() * other); } - inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other) + inline AutoDiffScalar& operator*=(const Scalar& other) { *this = *this * other; return derived(); } }; -template -struct auto_diff_special_op<_DerType, false> +template +struct auto_diff_special_op { void operator*() const; void operator-() const; -- GitLab From c0c7b695cd588e1694369dfc00f540f914434ed2 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 21 Jul 2021 12:41:29 
-0700 Subject: [PATCH 072/266] Fix assignment operator issue for latest MSVC+NVCC. Details are scattered across #920, #1000, #1324, #2291. Summary: some MSVC versions have a bug that requires omitting explicit `operator=` definitions (leads to duplicate definition errors), and some MSVC versions require adding explicit `operator=` definitions (otherwise implicitly deleted errors). This mess tries to cover all the cases encountered. Fixes #2291. (cherry picked from commit 9816fe59b47dc4c07967b5ee93a8e8aaa6e9c308) --- Eigen/src/Core/util/Macros.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index e5960d073..518c6c193 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1185,8 +1185,12 @@ namespace Eigen { #define EIGEN_USING_STD(FUNC) using std::FUNC; #endif -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC) - // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC)) + // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, + // otherwise we get duplicate definition errors + // For later MSVC versions, we require explicit operator= definition, otherwise we get + // use of implicitly deleted operator errors. 
+ // (cf Bugs 920, 1000, 1324, 2291) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) -- GitLab From 7adc1545b457ccf3e9316a8b81d676bd5b1f9834 Mon Sep 17 00:00:00 2001 From: hyunggi-sv Date: Tue, 3 Aug 2021 09:40:33 +0900 Subject: [PATCH 073/266] fix:typo in dox (has->have) (cherry picked from commit 02a0e79c701da7aa8dfad79b13cd1e7fae46d634) --- Eigen/src/Core/util/Constants.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index f7f907ab7..35dcaa7b3 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -157,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40; /** \deprecated \ingroup flags * * means the first coefficient packet is guaranteed to be aligned. - * An expression cannot has the AlignedBit without the PacketAccessBit flag. + * An expression cannot have the AlignedBit without the PacketAccessBit flag. * In other words, this means we are allow to perform an aligned packet access to the first element regardless * of the expression kind: * \code -- GitLab From 3dc42eeaecdeb71932d9a8a2e3c8333d262eb71d Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 20 Jul 2021 13:53:41 -0700 Subject: [PATCH 074/266] Enable equality comparisons on GPU. Since `std::equal_to::operator()` is not a device function, it fails on GPU. On my device, I seem to get a silent crash in the kernel (no reported error, but the kernel does not complete). Replacing this with a portable version enables comparisons on device. Addresses #2292 - would need to be cherry-picked. The 3.3 branch also requires adding `EIGEN_DEVICE_FUNC` in `BooleanRedux.h` to get fully working. 
(cherry picked from commit 7880f10526a11dc5544426c54c5763de576bf285) --- Eigen/src/Core/functors/StlFunctors.h | 30 ++++++++++++++++++++++++ Eigen/src/plugins/MatrixCwiseBinaryOps.h | 8 +++---- test/gpu_basic.cu | 15 ++++++------ 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h index d2e7b5b03..4570c9b63 100644 --- a/Eigen/src/Core/functors/StlFunctors.h +++ b/Eigen/src/Core/functors/StlFunctors.h @@ -12,6 +12,28 @@ namespace Eigen { +// Portable replacements for certain functors. +namespace numext { + +template +struct equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +template +struct not_equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +} + + namespace internal { // default functor traits for STL functors: @@ -68,10 +90,18 @@ template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + #if (EIGEN_COMP_CXXVER < 11) // std::binder* are deprecated since c++11 and will be removed in c++17 template diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h index f1084abef..a0feef871 100644 --- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const 
OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise != operator of *this and \a other @@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and \a other diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu index bf8dcacde..4298da3bb 100644 --- a/test/gpu_basic.cu +++ b/test/gpu_basic.cu @@ -197,18 +197,17 @@ struct complex_operators { res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array(); block_idx += size; - // Equality comparisons currently not functional on device - // (std::equal_to is host-only). - // const T true_vector = T::Constant(true_value); - // const T false_vector = T::Constant(false_value); - // res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); - // block_idx += size; + const T true_vector = T::Constant(true_value); + const T false_vector = T::Constant(false_value); + res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); + block_idx += size; + // Mixing types in equality comparison does not work. // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector); // block_idx += size; // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector); // block_idx += size; - // res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector); - // block_idx += size; + res.segment(block_idx, size) = (x1 != x2 ? 
true_vector : false_vector); + block_idx += size; // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector); // block_idx += size; // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector); -- GitLab From 237c59a2aaa8533a6e777326aca5c09e29f7fef6 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 8 Jul 2021 20:32:23 -0700 Subject: [PATCH 075/266] Modify scalar pzero, ptrue, pselect, and p operations to avoid memset. The `memset` function and bitwise manipulation only apply to POD types that do not require initialization, otherwise resulting in UB. We currently violate this in `ptrue` and `pzero`, we assume bitmasks for `pselect`, and bitwise operations are applied byte-by-byte in the generic implementations. This is causing issues for scalar types that do require initialization or that contain non-POD info such as pointers (#2201). We either break them, or force specializations of these functions for custom scalars, even if they are not vectorized. Here we modify these functions for scalars only - instead using only scalar operations: - `pzero`: `Scalar(0)` for all scalars. - `ptrue`: `Scalar(1)` for non-trivial scalars, bitset to one bits for trivial scalars. - `pselect`: ternary select comparing mask to `Scalar(0)` for all scalars - `pand`, `por`, `pxor`, `pnot`: use operators `&`, `|`, `^`, `~` for all integer or non-trivial scalars, otherwise apply bytewise. For non-scalar types, the original implementations are used to maintain compatibility and minimize the number of changes. Fixes #2201. 
(cherry picked from commit 3d98a6ef5ce0ba85acaee4ffffc53f0f21bd8fd2) --- Eigen/src/Core/GenericPacketMath.h | 220 +++++++++++++++++++++++------ Eigen/src/Core/util/XprHelper.h | 14 +- test/AnnoyingScalar.h | 6 +- 3 files changed, 176 insertions(+), 64 deletions(-) diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 53800a005..cf677a190 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -129,6 +129,22 @@ template struct packet_traits : default_packet_traits template struct packet_traits : packet_traits { }; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1, + vectorizable = false, + masked_load_available=false, + masked_store_available=false + }; +}; + +template struct unpacket_traits : unpacket_traits { }; + template struct type_casting_traits { enum { VectorizedCast = 0, @@ -154,6 +170,18 @@ struct eigen_packet_wrapper T m_val; }; + +/** \internal A convenience utility for determining if the type is a scalar. + * This is used to enable some generic packet implementations. + */ +template +struct is_scalar { + typedef typename unpacket_traits::type Scalar; + enum { + value = internal::is_same::value + }; +}; + /** \internal \returns static_cast(a) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline TgtPacket @@ -215,13 +243,59 @@ pmul(const bool& a, const bool& b) { return a && b; } template EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) { return a/b; } -/** \internal \returns one bits */ +// In the generic case, memset to all one bits. +template +struct ptrue_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){ + Packet b; + memset(static_cast(&b), 0xff, sizeof(Packet)); + return b; + } +}; + +// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value). 
+// Although this is technically not a valid bitmask, the scalar path for pselect +// uses a comparison to zero, so this should still work in most cases. We don't +// have another option, since the scalar type requires initialization. +template +struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ + return T(1); + } +}; + +/** \internal \returns one bits. */ template EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} +ptrue(const Packet& a) { + return ptrue_impl::run(a); +} -/** \internal \returns zero bits */ +// In the general case, memset to zero. +template +struct pzero_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) { + Packet b; + memset(static_cast(&b), 0x00, sizeof(Packet)); + return b; + } +}; + +// For scalars, explicitly set to Scalar(0), since the underlying representation +// for zero may not consist of all-zero bits. +template +struct pzero_impl::value>::type> { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { + return T(0); + } +}; + +/** \internal \returns packet of zeros */ template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;} +pzero(const Packet& a) { + return pzero_impl::run(a); +} /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet @@ -238,33 +312,6 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? 
pzero(a) : ptrue(a); } -template<> EIGEN_DEVICE_FUNC inline float pzero(const float& a) { - EIGEN_UNUSED_VARIABLE(a) - return 0.f; -} - -template<> EIGEN_DEVICE_FUNC inline double pzero(const double& a) { - EIGEN_UNUSED_VARIABLE(a) - return 0.; -} - -template -EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { - RealScalar b = ptrue(RealScalar(0)); - return std::complex(b, b); -} - -template -EIGEN_DEVICE_FUNC inline Packet bitwise_helper(const Packet& a, const Packet& b, Op op) { - const unsigned char* a_ptr = reinterpret_cast(&a); - const unsigned char* b_ptr = reinterpret_cast(&b); - Packet c; - unsigned char* c_ptr = reinterpret_cast(&c); - for (size_t i = 0; i < sizeof(Packet); ++i) { - *c_ptr++ = op(*a_ptr++, *b_ptr++); - } - return c; -} template struct bit_and { @@ -287,42 +334,123 @@ struct bit_xor { } }; +template +struct bit_not { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +// Use operators &, |, ^, ~. 
+template +struct operator_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not()(a); } +}; + +// Apply binary operations byte-by-byte +template +struct bytewise_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { + return binary(a, b, bit_and()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + return binary(a, b, bit_or()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { + return binary(a, b, bit_xor()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + return unary(a,bit_not()); + } + + private: + template + EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++); + } + return c; + } + + template + EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + const unsigned char* b_ptr = reinterpret_cast(&b); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++, *b_ptr++); + } + return c; + } +}; + +// In the general case, use byte-by-byte manipulation. +template +struct bitwise_helper : public bytewise_bitwise_helper {}; + +// For integers or non-trivial scalars, use binary operators. 
+template +struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + > : public operator_bitwise_helper {}; + /** \internal \returns the bitwise and of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pand(const Packet& a, const Packet& b) { - return bitwise_helper(a, b, bit_and()); + return bitwise_helper::bitwise_and(a, b); } /** \internal \returns the bitwise or of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet por(const Packet& a, const Packet& b) { - return bitwise_helper(a ,b, bit_or()); + return bitwise_helper::bitwise_or(a, b); } /** \internal \returns the bitwise xor of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pxor(const Packet& a, const Packet& b) { - return bitwise_helper(a ,b, bit_xor()); + return bitwise_helper::bitwise_xor(a, b); +} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + return bitwise_helper::bitwise_not(a); } /** \internal \returns the bitwise and of \a a and not \a b */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return pand(a, pxor(ptrue(b), b)); } +pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); } + +// In the general case, use bitwise select. +template +struct pselect_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); + } +}; + +// For scalars, use ternary select. +template +struct pselect_impl::value>::type > { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return numext::equal_strict(mask, Packet(0)) ? 
b : a; + } +}; /** \internal \returns \a or \b for each field in packet according to \mask */ template EIGEN_DEVICE_FUNC inline Packet pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} - -template<> EIGEN_DEVICE_FUNC inline float pselect( - const float& cond, const float& a, const float&b) { - return numext::equal_strict(cond,0.f) ? b : a; -} - -template<> EIGEN_DEVICE_FUNC inline double pselect( - const double& cond, const double& a, const double& b) { - return numext::equal_strict(cond,0.) ? b : a; + return pselect_impl::run(mask, a, b); } template<> EIGEN_DEVICE_FUNC inline bool pselect( diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index f2323174e..71c32b8a1 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -184,19 +184,7 @@ template struct functor_traits template struct packet_traits; -template struct unpacket_traits -{ - typedef T type; - typedef T half; - enum - { - size = 1, - alignment = 1, - vectorizable = false, - masked_load_available=false, - masked_store_available=false - }; -}; +template struct unpacket_traits; template::size)==0 || is_same::half>::value> diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h index 0f8e70d36..7ace083c5 100644 --- a/test/AnnoyingScalar.h +++ b/test/AnnoyingScalar.h @@ -126,7 +126,7 @@ template<> struct NumTraits : NumTraits { enum { - RequireInitialization = true + RequireInitialization = 1, }; typedef AnnoyingScalar Real; typedef AnnoyingScalar Nested; @@ -145,10 +145,6 @@ bool (isfinite)(const AnnoyingScalar& x) { } namespace internal { - template<> EIGEN_STRONG_INLINE AnnoyingScalar pcmp_eq(const AnnoyingScalar& a, const AnnoyingScalar& b) - { return AnnoyingScalar(pcmp_eq(*a.v, *b.v)); } - template<> EIGEN_STRONG_INLINE AnnoyingScalar pselect(const AnnoyingScalar& mask, const AnnoyingScalar& a, const AnnoyingScalar& b) - { return numext::equal_strict(*mask.v, 0.f) ? 
b : a; } template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); } template<> EIGEN_STRONG_INLINE float cast(const AnnoyingScalar& x) { return *x.v; } } -- GitLab From bb33880e5774104fbb8c7e83b96b1a65dc889425 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 12 Jul 2021 10:24:44 -0700 Subject: [PATCH 076/266] Fix TriSycl CMake files. This is to enable compiling with the latest trisycl. `FindTriSYCL.cmake` was broken by commit 00f32752, which modified `add_sycl_to_target` for ComputeCPP. This makes the corresponding modifications for trisycl to make them consistent. Also, trisycl now requires c++17. (cherry picked from commit 8cf6cb27baa9607cc00e5dbb42a1c31efda41b74) --- cmake/FindTriSYCL.cmake | 58 +++++++++++++------- unsupported/doc/examples/SYCL/CMakeLists.txt | 3 +- unsupported/test/CMakeLists.txt | 4 +- 3 files changed, 42 insertions(+), 23 deletions(-) diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake index 41bc2fa89..278596865 100644 --- a/cmake/FindTriSYCL.cmake +++ b/cmake/FindTriSYCL.cmake @@ -57,12 +57,12 @@ mark_as_advanced(TRISYCL_DEBUG_STRUCTORS) mark_as_advanced(TRISYCL_TRACE_KERNEL) #triSYCL definitions -set(CL_SYCL_LANGUAGE_VERSION 220 CACHE VERSION +set(CL_SYCL_LANGUAGE_VERSION 220 CACHE STRING "Host language version to be used by trisYCL (default is: 220)") -set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE VERSION +set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE STRING "Device language version to be used by trisYCL (default is: 220)") -#set(TRISYCL_COMPILE_OPTIONS "-std=c++1z -Wall -Wextra") -set(CMAKE_CXX_STANDARD 14) +# triSYCL now requires c++17 +set(CMAKE_CXX_STANDARD 17) set(CXX_STANDARD_REQUIRED ON) @@ -93,6 +93,20 @@ endif() find_package(Threads REQUIRED) # Find triSYCL directory +if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES) + set(TRISYCL_FIND_QUIETLY TRUE) +endif () + +find_path(TRISYCL_INCLUDE_DIR + NAMES sycl.hpp + PATHS $ENV{TRISYCLDIR} $ENV{TRISYCLDIR}/include ${INCLUDE_INSTALL_DIR} + 
PATH_SUFFIXES triSYCL +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(TriSYCL DEFAULT_MSG + TRISYCL_INCLUDE_DIR) + if(NOT TRISYCL_INCLUDE_DIR) message(FATAL_ERROR "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR") @@ -100,36 +114,42 @@ else() message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}") endif() +include(CMakeParseArguments) ####################### # add_sycl_to_target ####################### -# -# Sets the proper flags and includes for the target compilation. -# -# targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. -# binaryDir : Intermediate directory to output the integration header. -# -function(add_sycl_to_target targetName sourceFile binaryDir) +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(ADD_SYCL_ARGS + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) # Add include directories to the "#include <>" paths - target_include_directories (${targetName} PUBLIC + target_include_directories (${ADD_SYCL_ARGS_TARGET} PUBLIC ${TRISYCL_INCLUDE_DIR} ${Boost_INCLUDE_DIRS} $<$:${OpenCL_INCLUDE_DIRS}> $<$:${BOOST_COMPUTE_INCPATH}>) - # Link dependencies - target_link_libraries(${targetName} PUBLIC + target_link_libraries(${ADD_SYCL_ARGS_TARGET} $<$:${OpenCL_LIBRARIES}> Threads::Threads $<$:Boost::log> Boost::chrono) - # Compile definitions - target_compile_definitions(${targetName} PUBLIC + target_compile_definitions(${ADD_SYCL_ARGS_TARGET} PUBLIC + EIGEN_SYCL_TRISYCL $<$:TRISYCL_NO_ASYNC> $<$:TRISYCL_OPENCL> $<$:TRISYCL_DEBUG> @@ -138,13 +158,13 @@ function(add_sycl_to_target targetName sourceFile binaryDir) $<$:BOOST_LOG_DYN_LINK>) # C++ and OpenMP requirements - target_compile_options(${targetName} PUBLIC + target_compile_options(${ADD_SYCL_ARGS_TARGET} PUBLIC ${TRISYCL_COMPILE_OPTIONS} $<$:${OpenMP_CXX_FLAGS}>) 
if(${TRISYCL_OPENMP} AND (NOT WIN32)) # Does not support generator expressions - set_target_properties(${targetName} + set_target_properties(${ADD_SYCL_ARGS_TARGET} PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) endif() diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt index bef4f1925..1d0f721dc 100644 --- a/unsupported/doc/examples/SYCL/CMakeLists.txt +++ b/unsupported/doc/examples/SYCL/CMakeLists.txt @@ -3,8 +3,7 @@ FILE(GLOB examples_SRCS "*.cpp") set(EIGEN_SYCL ON) list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread) if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 14) - set(STD_CXX_FLAG "-std=c++1z") + set(CMAKE_CXX_STANDARD 17) else(EIGEN_SYCL_TRISYCL) if(MSVC) # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 181919361..ba890c7d4 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -165,8 +165,8 @@ if(EIGEN_TEST_CXX11) endif() if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 14) - set(STD_CXX_FLAG "-std=c++1z") + # triSYCL now requires c++17. + set(CMAKE_CXX_STANDARD 17) else() if(MSVC) # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 -- GitLab From 9a1691a14ef321c7062461cba1369fcbc31dc0f5 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 16 Jul 2021 08:24:23 -0700 Subject: [PATCH 077/266] Fix cmake warnings, FindPASTIX/FindPTSCOTCH. We were getting a lot of warnings due to nested `find_package` calls within `Find***.cmake` files. The recommended approach is to use [`find_dependency`](https://cmake.org/cmake/help/latest/module/CMakeFindDependencyMacro.html) in package configuration files. I made this change for all instances. Case mismatches between `Find.cmake` and calling `find_package(`) also lead to warnings. Fixed for `FindPASTIX.cmake` and `FindSCOTCH.cmake`. 
`FindBLASEXT.cmake` was broken due to calling `find_package_handle_standard_args(BLAS ...)`. The package name must match, otherwise the `find_package(BLASEXT)` falsely thinks the package wasn't found. I changed to `BLASEXT`, but then also copied that value to `BLAS_FOUND` for compatibility. `FindPastix.cmake` had a typo that incorrectly added `PTSCOTCH` when looking for the `SCOTCH` component. `FindPTSCOTCH` incorrectly added `***-NOTFOUND` to include/library lists, corrupting them. This led to cmake errors down-the-line. Fixes #2288. (cherry picked from commit 1cdec386530c6b844389b96c199e723a1e4e71c7) --- cmake/FindBLAS.cmake | 5 +-- cmake/FindBLASEXT.cmake | 28 ++++++++------- cmake/FindComputeCpp.cmake | 3 +- cmake/FindFFTW.cmake | 3 +- cmake/FindHWLOC.cmake | 5 +-- cmake/FindLAPACK.cmake | 7 ++-- cmake/{FindPastix.cmake => FindPASTIX.cmake} | 36 ++++++++++---------- cmake/FindPTSCOTCH.cmake | 33 +++++++++--------- cmake/{FindScotch.cmake => FindSCOTCH.cmake} | 5 +-- cmake/FindTriSYCL.cmake | 9 ++--- 10 files changed, 72 insertions(+), 62 deletions(-) rename cmake/{FindPastix.cmake => FindPASTIX.cmake} (96%) rename cmake/{FindScotch.cmake => FindSCOTCH.cmake} (99%) diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake index 7d1f81b03..1bb8f1965 100644 --- a/cmake/FindBLAS.cmake +++ b/cmake/FindBLAS.cmake @@ -147,6 +147,7 @@ mark_as_advanced(BLAS_VERBOSE) include(CheckFunctionExists) include(CheckFortranFunctionExists) +include(CMakeFindDependencyMacro) set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) @@ -509,9 +510,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) - find_package(Threads) + find_dependency(Threads) else() - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) endif() set(BLAS_SEARCH_LIBS "") diff --git a/cmake/FindBLASEXT.cmake b/cmake/FindBLASEXT.cmake index 0fe7fb849..69a941897 100644 
--- a/cmake/FindBLASEXT.cmake +++ b/cmake/FindBLASEXT.cmake @@ -41,18 +41,19 @@ # License text for the above reference.) # macro to factorize this call +include(CMakeFindDependencyMacro) macro(find_package_blas) if(BLASEXT_FIND_REQUIRED) if(BLASEXT_FIND_QUIETLY) - find_package(BLAS REQUIRED QUIET) + find_dependency(BLAS REQUIRED QUIET) else() - find_package(BLAS REQUIRED) + find_dependency(BLAS REQUIRED) endif() else() if(BLASEXT_FIND_QUIETLY) - find_package(BLAS QUIET) + find_dependency(BLAS QUIET) else() - find_package(BLAS) + find_dependency(BLAS) endif() endif() endmacro() @@ -316,7 +317,7 @@ if(BLA_VENDOR MATCHES "Intel*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS BLAS_INCLUDE_DIRS) @@ -324,14 +325,14 @@ if(BLA_VENDOR MATCHES "Intel*") if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() else() if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS BLAS_INCLUDE_DIRS) @@ -343,14 +344,14 @@ elseif(BLA_VENDOR MATCHES "ACML*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) if(BLAS_PAR_LIBRARIES) if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - 
find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() elseif(BLA_VENDOR MATCHES "IBMESSL*") @@ -360,21 +361,24 @@ elseif(BLA_VENDOR MATCHES "IBMESSL*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) if(BLAS_PAR_LIBRARIES) if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() else() if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) endif() + +# Callers expect BLAS_FOUND to be set as well. 
+set(BLAS_FOUND ${BLASEXT_FOUND}) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 3cca5150e..1c271f0fe 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -41,7 +41,8 @@ set(COMPUTECPP_BITCODE "spir64" CACHE STRING "Bitcode type to use as SYCL target in compute++") mark_as_advanced(COMPUTECPP_BITCODE) -find_package(OpenCL REQUIRED) +include(CMakeFindDependencyMacro) +find_dependency(OpenCL REQUIRED) # Find ComputeCpp package diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake index fad476d0d..ed55c5fad 100644 --- a/cmake/FindFFTW.cmake +++ b/cmake/FindFFTW.cmake @@ -22,7 +22,8 @@ if( NOT FFTW_ROOT AND ENV{FFTWDIR} ) endif() # Check if we can use PkgConfig -find_package(PkgConfig) +include(CMakeFindDependencyMacro) +find_dependency(PkgConfig) #Determine from PKG if( PKG_CONFIG_FOUND AND NOT FFTW_ROOT ) diff --git a/cmake/FindHWLOC.cmake b/cmake/FindHWLOC.cmake index 483291518..522f52157 100644 --- a/cmake/FindHWLOC.cmake +++ b/cmake/FindHWLOC.cmake @@ -65,8 +65,9 @@ endif() # Optionally use pkg-config to detect include/library dirs (if pkg-config is available) # ------------------------------------------------------------------------------------- -include(FindPkgConfig) -find_package(PkgConfig QUIET) +include(CMakeFindDependencyMacro) +# include(FindPkgConfig) +find_dependency(PkgConfig QUIET) if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) pkg_search_module(HWLOC hwloc) diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake index 284a4529c..3fd738807 100644 --- a/cmake/FindLAPACK.cmake +++ b/cmake/FindLAPACK.cmake @@ -26,6 +26,7 @@ include(CheckFunctionExists) +include(CMakeFindDependencyMacro) # This macro checks for the existence of the combination of fortran libraries # given by _list.
If the combination is found, this macro checks (using the @@ -88,7 +89,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b set(${LIBRARIES} ${_libraries_found}) # Some C++ linkers require the f2c library to link with Fortran libraries. # I do not know which ones, thus I just add the f2c library if it is available. - find_package( F2C QUIET ) + find_dependency( F2C QUIET ) if ( F2C_FOUND ) set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) @@ -135,9 +136,9 @@ endmacro() # LAPACK requires BLAS if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - find_package(BLAS) + find_dependency(BLAS) else() - find_package(BLAS REQUIRED) + find_dependency(BLAS REQUIRED) endif() if (NOT BLAS_FOUND) diff --git a/cmake/FindPastix.cmake b/cmake/FindPASTIX.cmake similarity index 96% rename from cmake/FindPastix.cmake rename to cmake/FindPASTIX.cmake index 3b47d5ce3..db1427b0a 100644 --- a/cmake/FindPastix.cmake +++ b/cmake/FindPASTIX.cmake @@ -118,7 +118,7 @@ if( PASTIX_FIND_COMPONENTS ) if (${component} STREQUAL "SCOTCH") set(PASTIX_LOOK_FOR_SCOTCH ON) endif() - if (${component} STREQUAL "SCOTCH") + if (${component} STREQUAL "PTSCOTCH") set(PASTIX_LOOK_FOR_PTSCOTCH ON) endif() if (${component} STREQUAL "METIS") @@ -133,14 +133,14 @@ endif() # Required dependencies # --------------------- - +include(CMakeFindDependencyMacro) if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect pthread") endif() if (PASTIX_FIND_REQUIRED) - find_package(Threads REQUIRED QUIET) + find_dependency(Threads REQUIRED QUIET) else() - find_package(Threads QUIET) + find_dependency(Threads QUIET) endif() set(PASTIX_EXTRA_LIBRARIES "") if( THREADS_FOUND ) @@ -198,9 +198,9 @@ if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect HWLOC") endif() if (PASTIX_FIND_REQUIRED) - find_package(HWLOC REQUIRED QUIET) + find_dependency(HWLOC REQUIRED QUIET) else() - find_package(HWLOC 
QUIET) + find_dependency(HWLOC QUIET) endif() # PASTIX depends on BLAS @@ -209,9 +209,9 @@ if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect BLAS") endif() if (PASTIX_FIND_REQUIRED) - find_package(BLASEXT REQUIRED QUIET) + find_dependency(BLASEXT REQUIRED QUIET) else() - find_package(BLASEXT QUIET) + find_dependency(BLASEXT QUIET) endif() # Optional dependencies @@ -230,9 +230,9 @@ if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) set(MPI_C_COMPILER mpicc) endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI) - find_package(MPI REQUIRED QUIET) + find_dependency(MPI REQUIRED QUIET) else() - find_package(MPI QUIET) + find_dependency(MPI QUIET) endif() if (MPI_FOUND) mark_as_advanced(MPI_LIBRARY) @@ -272,10 +272,10 @@ if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) endif() # set the list of optional dependencies we may discover if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU) - find_package(STARPU ${PASTIX_STARPU_VERSION} REQUIRED + find_dependency(STARPU ${PASTIX_STARPU_VERSION} REQUIRED COMPONENTS ${STARPU_COMPONENT_LIST}) else() - find_package(STARPU ${PASTIX_STARPU_VERSION} + find_dependency(STARPU ${PASTIX_STARPU_VERSION} COMPONENTS ${STARPU_COMPONENT_LIST}) endif() @@ -288,9 +288,9 @@ if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH) message(STATUS "Looking for PASTIX - Try to detect SCOTCH") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH) - find_package(SCOTCH REQUIRED QUIET) + find_dependency(SCOTCH REQUIRED QUIET) else() - find_package(SCOTCH QUIET) + find_dependency(SCOTCH QUIET) endif() endif() @@ -301,9 +301,9 @@ if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH) message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH) - find_package(PTSCOTCH REQUIRED QUIET) + find_dependency(PTSCOTCH REQUIRED QUIET) else() - find_package(PTSCOTCH QUIET) + find_dependency(PTSCOTCH QUIET) endif() endif() @@ -314,9 +314,9 @@ if (NOT 
METIS_FOUND AND PASTIX_LOOK_FOR_METIS) message(STATUS "Looking for PASTIX - Try to detect METIS") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS) - find_package(METIS REQUIRED QUIET) + find_dependency(METIS REQUIRED QUIET) else() - find_package(METIS QUIET) + find_dependency(METIS QUIET) endif() endif() diff --git a/cmake/FindPTSCOTCH.cmake b/cmake/FindPTSCOTCH.cmake index 51eecf1af..6ccc743e6 100644 --- a/cmake/FindPTSCOTCH.cmake +++ b/cmake/FindPTSCOTCH.cmake @@ -79,20 +79,21 @@ if( PTSCOTCH_FIND_COMPONENTS ) endif() # PTSCOTCH depends on Threads, try to find it +include(CMakeFindDependencyMacro) if (NOT THREADS_FOUND) if (PTSCOTCH_FIND_REQUIRED) - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) else() - find_package(Threads) + find_dependency(Threads) endif() endif() # PTSCOTCH depends on MPI, try to find it if (NOT MPI_FOUND) if (PTSCOTCH_FIND_REQUIRED) - find_package(MPI REQUIRED) + find_dependency(MPI REQUIRED) else() - find_package(MPI) + find_dependency(MPI) endif() endif() @@ -148,18 +149,18 @@ else() foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") find_path(PTSCOTCH_${ptscotch_hdr}_DIRS - NAMES ${ptscotch_hdr} - HINTS ${PTSCOTCH_DIR} - PATH_SUFFIXES "include" "include/scotch") + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) endforeach() else() foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") find_path(PTSCOTCH_${ptscotch_hdr}_DIRS - NAMES ${ptscotch_hdr} - HINTS ${_inc_env} - PATH_SUFFIXES "scotch") + NAMES ${ptscotch_hdr} + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) endforeach() endif() @@ -171,7 +172,6 @@ foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) if (PTSCOTCH_${ptscotch_hdr}_DIRS) list(APPEND PTSCOTCH_INCLUDE_DIRS 
"${PTSCOTCH_${ptscotch_hdr}_DIRS}") else () - set(PTSCOTCH_INCLUDE_DIRS "PTSCOTCH_INCLUDE_DIRS-NOTFOUND") if (NOT PTSCOTCH_FIND_QUIETLY) message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found") endif() @@ -229,16 +229,16 @@ else() foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY - NAMES ${ptscotch_lib} - HINTS ${PTSCOTCH_DIR} - PATH_SUFFIXES lib lib32 lib64) + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) endforeach() else() foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY - NAMES ${ptscotch_lib} - HINTS ${_lib_env}) + NAMES ${ptscotch_lib} + HINTS ${_lib_env}) endforeach() endif() endif() @@ -255,7 +255,6 @@ foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}") else () - list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") if (NOT PTSCOTCH_FIND_QUIETLY) message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found") endif() diff --git a/cmake/FindScotch.cmake b/cmake/FindSCOTCH.cmake similarity index 99% rename from cmake/FindScotch.cmake rename to cmake/FindSCOTCH.cmake index af00eb0f2..11b971a92 100644 --- a/cmake/FindScotch.cmake +++ b/cmake/FindSCOTCH.cmake @@ -71,11 +71,12 @@ if( SCOTCH_FIND_COMPONENTS ) endif() # SCOTCH may depend on Threads, try to find it +include(CMakeFindDependencyMacro) if (NOT THREADS_FOUND) if (SCOTCH_FIND_REQUIRED) - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) else() - find_package(Threads) + find_dependency(Threads) endif() endif() diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake index 278596865..810423907 100644 --- a/cmake/FindTriSYCL.cmake +++ 
b/cmake/FindTriSYCL.cmake @@ -67,8 +67,9 @@ set(CXX_STANDARD_REQUIRED ON) # Find OpenCL package +include(CMakeFindDependencyMacro) if(TRISYCL_OPENCL) - find_package(OpenCL REQUIRED) + find_dependency(OpenCL REQUIRED) if(UNIX) set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH "Path to Boost.Compute headers (default is: /usr/include/compute)") @@ -77,11 +78,11 @@ endif() # Find OpenMP package if(TRISYCL_OPENMP) - find_package(OpenMP REQUIRED) + find_dependency(OpenMP REQUIRED) endif() # Find Boost -find_package(Boost 1.58 REQUIRED COMPONENTS chrono log) +find_dependency(Boost 1.58 REQUIRED COMPONENTS chrono log) # If debug or trace we need boost log if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL) @@ -90,7 +91,7 @@ else() set(LOG_NEEDED OFF) endif() -find_package(Threads REQUIRED) +find_dependency(Threads REQUIRED) # Find triSYCL directory if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES) -- GitLab From 46ecdcd7451824e3ca26d83534caea019c1b7862 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 16 Jul 2021 13:45:15 -0700 Subject: [PATCH 078/266] Fix MPReal detection and support. The latest version of `mpreal` has a bug that breaks `min`/`max`. It also breaks with the latest dev version of `mpfr`. Here we add `FindMPREAL.cmake` which searches for the library and tests if compilation works. Removed our internal copy of `mpreal.h` under `unsupported/test`, as it is out-of-sync with the latest, and similarly breaks with the latest `mpfr`. It would be best to use the installed version of `mpreal` anyways, since that's what we actually want to test. Fixes #2282. 
(cherry picked from commit 31f796ebef35eeadd0e26878aab3fe99ca412a45) --- cmake/FindMPREAL.cmake | 103 + unsupported/test/CMakeLists.txt | 10 +- unsupported/test/mpreal/mpreal.h | 3184 --------------------------- unsupported/test/mpreal_support.cpp | 1 + 4 files changed, 108 insertions(+), 3190 deletions(-) create mode 100644 cmake/FindMPREAL.cmake delete mode 100644 unsupported/test/mpreal/mpreal.h diff --git a/cmake/FindMPREAL.cmake b/cmake/FindMPREAL.cmake new file mode 100644 index 000000000..947a1ce88 --- /dev/null +++ b/cmake/FindMPREAL.cmake @@ -0,0 +1,103 @@ +# Try to find the MPFR C++ (MPREAL) library +# See http://www.holoborodko.com/pavel/mpreal/ +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(MPREAL 1.8.6) +# to require version 1.8.6 or newer of MPREAL C++. +# +# Once done this will define +# +# MPREAL_FOUND - system has MPREAL lib with correct version +# MPREAL_INCLUDES - MPREAL required include directories +# MPREAL_LIBRARIES - MPREAL required libraries +# MPREAL_VERSION - MPREAL version + +# Copyright (c) 2020 The Eigen Authors. +# Redistribution and use is allowed according to the terms of the BSD license. 
+ +include(CMakeFindDependencyMacro) +find_dependency(MPFR) +find_dependency(GMP) + +# Set MPREAL_INCLUDES +find_path(MPREAL_INCLUDES + NAMES + mpreal.h + PATHS + $ENV{GMPDIR} + ${INCLUDE_INSTALL_DIR} +) + +# Set MPREAL_FIND_VERSION to 1.0.0 if no minimum version is specified + +if(NOT MPREAL_FIND_VERSION) + if(NOT MPREAL_FIND_VERSION_MAJOR) + set(MPREAL_FIND_VERSION_MAJOR 1) + endif() + if(NOT MPREAL_FIND_VERSION_MINOR) + set(MPREAL_FIND_VERSION_MINOR 0) + endif() + if(NOT MPREAL_FIND_VERSION_PATCH) + set(MPREAL_FIND_VERSION_PATCH 0) + endif() + + set(MPREAL_FIND_VERSION "${MPREAL_FIND_VERSION_MAJOR}.${MPREAL_FIND_VERSION_MINOR}.${MPREAL_FIND_VERSION_PATCH}") +endif() + +# Check bugs +# - https://github.com/advanpix/mpreal/issues/7 +# - https://github.com/advanpix/mpreal/issues/9 +set(MPREAL_TEST_PROGRAM " +#include <mpreal.h> +#include <algorithm> +int main(int argc, char** argv) { + const mpfr::mpreal one = 1.0; + const mpfr::mpreal zero = 0.0; + using namespace std; + const mpfr::mpreal smaller = min(one, zero); + return 0; +}") + +if(MPREAL_INCLUDES) + + # Set MPREAL_VERSION + + file(READ "${MPREAL_INCLUDES}/mpreal.h" _mpreal_version_header) + + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MAJOR[ \t]+([0-9]+)" _mpreal_major_version_match "${_mpreal_version_header}") + set(MPREAL_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MINOR[ \t]+([0-9]+)" _mpreal_minor_version_match "${_mpreal_version_header}") + set(MPREAL_MINOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpreal_patchlevel_version_match "${_mpreal_version_header}") + set(MPREAL_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}") + + set(MPREAL_VERSION ${MPREAL_MAJOR_VERSION}.${MPREAL_MINOR_VERSION}.${MPREAL_PATCHLEVEL_VERSION}) + + # Check whether found version exceeds minimum version + + if(${MPREAL_VERSION} VERSION_LESS ${MPREAL_FIND_VERSION}) + set(MPREAL_VERSION_OK FALSE) + message(STATUS "MPREAL version ${MPREAL_VERSION} found in
${MPREAL_INCLUDES}, " + "but at least version ${MPREAL_FIND_VERSION} is required") + else() + set(MPREAL_VERSION_OK TRUE) + + list(APPEND MPREAL_INCLUDES "${MPFR_INCLUDES}" "${GMP_INCLUDES}") + list(REMOVE_DUPLICATES MPREAL_INCLUDES) + + list(APPEND MPREAL_LIBRARIES "${MPFR_LIBRARIES}" "${GMP_LIBRARIES}") + list(REMOVE_DUPLICATES MPREAL_LIBRARIES) + + # Make sure it compiles with the current compiler. + unset(MPREAL_WORKS CACHE) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES "${MPREAL_INCLUDES}") + set(CMAKE_REQUIRED_LIBRARIES "${MPREAL_LIBRARIES}") + check_cxx_source_compiles("${MPREAL_TEST_PROGRAM}" MPREAL_WORKS) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MPREAL DEFAULT_MSG + MPREAL_INCLUDES MPREAL_VERSION_OK MPREAL_WORKS) +mark_as_advanced(MPREAL_INCLUDES) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index ba890c7d4..d30fa62bd 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -55,13 +55,11 @@ ei_add_test(FFT) ei_add_test(EulerAngles) -find_package(MPFR 2.3.0) -find_package(GMP) -if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11) - include_directories(${MPFR_INCLUDES} ./mpreal) +find_package(MPREAL) +if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") - set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" ) + include_directories(${MPREAL_INCLUDES}) + ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h deleted file mode 100644 index 5cfd66a10..000000000 --- a/unsupported/test/mpreal/mpreal.h +++ /dev/null @@ -1,3184 +0,0 @@ -/* - MPFR C++: Multi-precision floating point number class for C++. 
- Based on MPFR library: http://mpfr.org - - Project homepage: http://www.holoborodko.com/pavel/mpfr - Contact e-mail: pavel@holoborodko.com - - Copyright (c) 2008-2016 Pavel Holoborodko - - Contributors: - Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman, - Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen, - Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng, - Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood, - Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow, - Rodney James, Jorge Leitao, Jerome Benoit. - - Licensing: - (A) MPFR C++ is under GNU General Public License ("GPL"). - - (B) Non-free licenses may also be purchased from the author, for users who - do not want their programs protected by the GPL. - - The non-free licenses are for users that wish to use MPFR C++ in - their products but are unwilling to release their software - under the GPL (which would require them to release source code - and allow free redistribution). - - Such users can purchase an unlimited-use license from the author. - Contact us for more details. - - GNU General Public License ("GPL") copyright permissions statement: - ************************************************************************** - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/ - -#ifndef __MPREAL_H__ -#define __MPREAL_H__ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// Options -#define MPREAL_HAVE_MSVC_DEBUGVIEW // Enable Debugger Visualizer for "Debug" builds in MSVC. -#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS // Enable extended std::numeric_limits specialization. - // Meaning that "digits", "round_style" and similar members are defined as functions, not constants. - // See std::numeric_limits at the end of the file for more information. - -// Library version -#define MPREAL_VERSION_MAJOR 3 -#define MPREAL_VERSION_MINOR 6 -#define MPREAL_VERSION_PATCHLEVEL 5 -#define MPREAL_VERSION_STRING "3.6.5" - -// Detect compiler using signatures from http://predef.sourceforge.net/ -#if defined(__GNUC__) && defined(__INTEL_COMPILER) - #define IsInf(x) isinf EIGEN_NOT_A_MACRO (x) // Intel ICC compiler on Linux - -#elif defined(_MSC_VER) // Microsoft Visual C++ - #define IsInf(x) (!_finite(x)) - -#else - #define IsInf(x) std::isinf EIGEN_NOT_A_MACRO (x) // GNU C/C++ (and/or other compilers), just hope for C99 conformance -#endif - -// A Clang feature extension to determine compiler features. -#ifndef __has_feature - #define __has_feature(x) 0 -#endif - -// Detect support for r-value references (move semantic). -// Move semantic should be enabled with great care in multi-threading environments, -// especially if MPFR uses custom memory allocators. -// Everything should be thread-safe and support passing ownership over thread boundary. 
-#if (__has_feature(cxx_rvalue_references) || \ - defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \ - (defined(_MSC_VER) && _MSC_VER >= 1600) && !defined(MPREAL_DISABLE_MOVE_SEMANTIC)) - - #define MPREAL_HAVE_MOVE_SUPPORT - - // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization - #define mpfr_is_initialized(x) (0 != (x)->_mpfr_d) - #define mpfr_set_uninitialized(x) ((x)->_mpfr_d = 0 ) -#endif - -// Detect support for explicit converters. -#if (__has_feature(cxx_explicit_conversions) || \ - (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \ - (defined(_MSC_VER) && _MSC_VER >= 1800) || \ - (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1300)) - - #define MPREAL_HAVE_EXPLICIT_CONVERTERS -#endif - -#define MPFR_USE_INTMAX_T // Enable 64-bit integer types - should be defined before mpfr.h - -#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG) - #define MPREAL_MSVC_DEBUGVIEW_CODE DebugView = toString(); - #define MPREAL_MSVC_DEBUGVIEW_DATA std::string DebugView; -#else - #define MPREAL_MSVC_DEBUGVIEW_CODE - #define MPREAL_MSVC_DEBUGVIEW_DATA -#endif - -#include - -#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0)) - #include // Needed for random() -#endif - -// Less important options -#define MPREAL_DOUBLE_BITS_OVERFLOW -1 // Triggers overflow exception during conversion to double if mpreal - // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits - // = -1 disables overflow checks (default) - -// Fast replacement for mpfr_set_zero(x, +1): -// (a) uses low-level data members, might not be forward compatible -// (b) sign is not set, add (x)->_mpfr_sign = 1; -#define mpfr_set_zero_fast(x) ((x)->_mpfr_exp = __MPFR_EXP_ZERO) - -#if defined(__GNUC__) - #define MPREAL_PERMISSIVE_EXPR __extension__ -#else - #define MPREAL_PERMISSIVE_EXPR -#endif - -namespace mpfr { - -class mpreal { -private: - mpfr_t mp; - -public: - - // Get default rounding mode & precision - 
inline static mp_rnd_t get_default_rnd() { return (mp_rnd_t)(mpfr_get_default_rounding_mode()); } - inline static mp_prec_t get_default_prec() { return (mpfr_get_default_prec)(); } - - // Constructors && type conversions - mpreal(); - mpreal(const mpreal& u); - mpreal(const mpf_t u); - mpreal(const mpz_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const mpq_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const long double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const unsigned int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd()); - - // Construct mpreal from mpfr_t structure. - // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers. 
- mpreal(const mpfr_t u, bool shared = false); - - mpreal(const char* s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd()); - mpreal(const std::string& s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd()); - - ~mpreal(); - -#ifdef MPREAL_HAVE_MOVE_SUPPORT - mpreal& operator=(mpreal&& v); - mpreal(mpreal&& u); -#endif - - // Operations - // = - // +, -, *, /, ++, --, <<, >> - // *=, +=, -=, /=, - // <, >, ==, <=, >= - - // = - mpreal& operator=(const mpreal& v); - mpreal& operator=(const mpf_t v); - mpreal& operator=(const mpz_t v); - mpreal& operator=(const mpq_t v); - mpreal& operator=(const long double v); - mpreal& operator=(const double v); - mpreal& operator=(const unsigned long int v); - mpreal& operator=(const unsigned long long int v); - mpreal& operator=(const long long int v); - mpreal& operator=(const unsigned int v); - mpreal& operator=(const long int v); - mpreal& operator=(const int v); - mpreal& operator=(const char* s); - mpreal& operator=(const std::string& s); - template mpreal& operator= (const std::complex& z); - - // + - mpreal& operator+=(const mpreal& v); - mpreal& operator+=(const mpf_t v); - mpreal& operator+=(const mpz_t v); - mpreal& operator+=(const mpq_t v); - mpreal& operator+=(const long double u); - mpreal& operator+=(const double u); - mpreal& operator+=(const unsigned long int u); - mpreal& operator+=(const unsigned int u); - mpreal& operator+=(const long int u); - mpreal& operator+=(const int u); - - mpreal& operator+=(const long long int u); - mpreal& operator+=(const unsigned long long int u); - mpreal& operator-=(const long long int u); - mpreal& operator-=(const unsigned long long int u); - mpreal& operator*=(const long long int u); - mpreal& operator*=(const unsigned long long int u); - mpreal& operator/=(const long long int u); - mpreal& operator/=(const unsigned long long int u); - - const mpreal operator+() const; - 
mpreal& operator++ (); - const mpreal operator++ (int); - - // - - mpreal& operator-=(const mpreal& v); - mpreal& operator-=(const mpz_t v); - mpreal& operator-=(const mpq_t v); - mpreal& operator-=(const long double u); - mpreal& operator-=(const double u); - mpreal& operator-=(const unsigned long int u); - mpreal& operator-=(const unsigned int u); - mpreal& operator-=(const long int u); - mpreal& operator-=(const int u); - const mpreal operator-() const; - friend const mpreal operator-(const unsigned long int b, const mpreal& a); - friend const mpreal operator-(const unsigned int b, const mpreal& a); - friend const mpreal operator-(const long int b, const mpreal& a); - friend const mpreal operator-(const int b, const mpreal& a); - friend const mpreal operator-(const double b, const mpreal& a); - mpreal& operator-- (); - const mpreal operator-- (int); - - // * - mpreal& operator*=(const mpreal& v); - mpreal& operator*=(const mpz_t v); - mpreal& operator*=(const mpq_t v); - mpreal& operator*=(const long double v); - mpreal& operator*=(const double v); - mpreal& operator*=(const unsigned long int v); - mpreal& operator*=(const unsigned int v); - mpreal& operator*=(const long int v); - mpreal& operator*=(const int v); - - // / - mpreal& operator/=(const mpreal& v); - mpreal& operator/=(const mpz_t v); - mpreal& operator/=(const mpq_t v); - mpreal& operator/=(const long double v); - mpreal& operator/=(const double v); - mpreal& operator/=(const unsigned long int v); - mpreal& operator/=(const unsigned int v); - mpreal& operator/=(const long int v); - mpreal& operator/=(const int v); - friend const mpreal operator/(const unsigned long int b, const mpreal& a); - friend const mpreal operator/(const unsigned int b, const mpreal& a); - friend const mpreal operator/(const long int b, const mpreal& a); - friend const mpreal operator/(const int b, const mpreal& a); - friend const mpreal operator/(const double b, const mpreal& a); - - //<<= Fast Multiplication by 2^u - mpreal& 
operator<<=(const unsigned long int u); - mpreal& operator<<=(const unsigned int u); - mpreal& operator<<=(const long int u); - mpreal& operator<<=(const int u); - - //>>= Fast Division by 2^u - mpreal& operator>>=(const unsigned long int u); - mpreal& operator>>=(const unsigned int u); - mpreal& operator>>=(const long int u); - mpreal& operator>>=(const int u); - - // Type Conversion operators - bool toBool ( ) const; - long toLong (mp_rnd_t mode = GMP_RNDZ) const; - unsigned long toULong (mp_rnd_t mode = GMP_RNDZ) const; - long long toLLong (mp_rnd_t mode = GMP_RNDZ) const; - unsigned long long toULLong (mp_rnd_t mode = GMP_RNDZ) const; - float toFloat (mp_rnd_t mode = GMP_RNDN) const; - double toDouble (mp_rnd_t mode = GMP_RNDN) const; - long double toLDouble (mp_rnd_t mode = GMP_RNDN) const; - -#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS) - explicit operator bool () const { return toBool(); } - explicit operator int () const { return int(toLong()); } - explicit operator long () const { return toLong(); } - explicit operator long long () const { return toLLong(); } - explicit operator unsigned () const { return unsigned(toULong()); } - explicit operator unsigned long () const { return toULong(); } - explicit operator unsigned long long () const { return toULLong(); } - explicit operator float () const { return toFloat(); } - explicit operator double () const { return toDouble(); } - explicit operator long double () const { return toLDouble(); } -#endif - - // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions - ::mpfr_ptr mpfr_ptr(); - ::mpfr_srcptr mpfr_ptr() const; - ::mpfr_srcptr mpfr_srcptr() const; - - // Convert mpreal to string with n significant digits in base b - // n = -1 -> convert with the maximum available digits - std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const; - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - std::string toString(const std::string& format) const; -#endif - 
- std::ostream& output(std::ostream& os) const; - - // Math Functions - friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode); - friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode); - friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode); - friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode); - friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode); - friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode); - friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode); - friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode); - friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode); - friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode); - friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode); - friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode); - friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode); - friend int cmpabs(const mpreal& a,const mpreal& b); - - friend const mpreal log (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal exp (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal exp10(const 
mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal nextpow2(const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode); - friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal acos (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal asin (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal atan (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode); - friend const mpreal acot (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal asec (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal acsc (const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal cosh (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal sinh (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal tanh (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal sech (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal csch (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal coth (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal hypot (const mpreal& 
x, const mpreal& y, mp_rnd_t rnd_mode); - - friend const mpreal fac_ui (unsigned long int v, mp_prec_t prec, mp_rnd_t rnd_mode); - friend const mpreal eint (const mpreal& v, mp_rnd_t rnd_mode); - - friend const mpreal gamma (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal tgamma (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal lngamma (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal lgamma (const mpreal& v, int *signp, mp_rnd_t rnd_mode); - friend const mpreal zeta (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal erf (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal erfc (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode); - friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode); - friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode); - friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode); - friend int sgn (const mpreal& v); - -// MPFR 2.4.0 Specifics -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - friend int sinh_cosh (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal li2 (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); - friend const mpreal rec_sqrt (const mpreal& v, mp_rnd_t rnd_mode); - - // MATLAB's semantic equivalents - friend const mpreal rem (const mpreal& x, const mpreal& y, 
mp_rnd_t rnd_mode); // Remainder after division - friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division -#endif - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) - friend const mpreal digamma (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal ai (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear -#endif - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0)) - friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear - friend const mpreal grandom (unsigned int seed); -#endif - - // Uniformly distributed random number generation in [0,1] using - // Mersenne-Twister algorithm by default. - // Use parameter to setup seed, e.g.: random((unsigned)time(NULL)) - // Check urandom() for more precise control. - friend const mpreal random(unsigned int seed); - - // Splits mpreal value into fractional and integer parts. - // Returns fractional part and stores integer part in n. 
- friend const mpreal modf(const mpreal& v, mpreal& n); - - // Constants - // don't forget to call mpfr_free_cache() for every thread where you are using const-functions - friend const mpreal const_log2 (mp_prec_t prec, mp_rnd_t rnd_mode); - friend const mpreal const_pi (mp_prec_t prec, mp_rnd_t rnd_mode); - friend const mpreal const_euler (mp_prec_t prec, mp_rnd_t rnd_mode); - friend const mpreal const_catalan (mp_prec_t prec, mp_rnd_t rnd_mode); - - // returns +inf iff sign>=0 otherwise -inf - friend const mpreal const_infinity(int sign, mp_prec_t prec); - - // Output/ Input - friend std::ostream& operator<<(std::ostream& os, const mpreal& v); - friend std::istream& operator>>(std::istream& is, mpreal& v); - - // Integer Related Functions - friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal ceil (const mpreal& v); - friend const mpreal floor(const mpreal& v); - friend const mpreal round(const mpreal& v); - friend const mpreal trunc(const mpreal& v); - friend const mpreal rint_ceil (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal rint_floor (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal rint_round (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal rint_trunc (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal frac (const mpreal& v, mp_rnd_t rnd_mode); - friend const mpreal remainder ( const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); - friend const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); - - // Miscellaneous Functions - friend const mpreal nexttoward (const mpreal& x, const mpreal& y); - friend const mpreal nextabove (const mpreal& x); - friend const mpreal nextbelow (const mpreal& x); - - // use gmp_randinit_default() to init state, gmp_randclear() to clear - friend const mpreal urandomb (gmp_randstate_t& state); - -// MPFR < 2.4.2 Specifics -#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2)) - friend const mpreal random2 (mp_size_t size, mp_exp_t exp); 
-#endif - - // Instance Checkers - friend bool isnan EIGEN_NOT_A_MACRO (const mpreal& v); - friend bool (isinf) (const mpreal& v); - friend bool (isfinite) (const mpreal& v); - - friend bool isnum (const mpreal& v); - friend bool iszero (const mpreal& v); - friend bool isint (const mpreal& v); - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) - friend bool isregular(const mpreal& v); -#endif - - // Set/Get instance properties - inline mp_prec_t get_prec() const; - inline void set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd()); // Change precision with rounding mode - - // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex interface - inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd()); - inline int getPrecision() const; - - // Set mpreal to +/- inf, NaN, +/-0 - mpreal& setInf (int Sign = +1); - mpreal& setNan (); - mpreal& setZero (int Sign = +1); - mpreal& setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd()); - - //Exponent - mp_exp_t get_exp() const; - int set_exp(mp_exp_t e); - int check_range (int t, mp_rnd_t rnd_mode = get_default_rnd()); - int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd()); - - // Inexact conversion from float - inline bool fits_in_bits(double x, int n); - - // Set/Get global properties - static void set_default_prec(mp_prec_t prec); - static void set_default_rnd(mp_rnd_t rnd_mode); - - static mp_exp_t get_emin (void); - static mp_exp_t get_emax (void); - static mp_exp_t get_emin_min (void); - static mp_exp_t get_emin_max (void); - static mp_exp_t get_emax_min (void); - static mp_exp_t get_emax_max (void); - static int set_emin (mp_exp_t exp); - static int set_emax (mp_exp_t exp); - - // Efficient swapping of two mpreal values - needed for std algorithms - friend void swap(mpreal& x, mpreal& y); - - friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); - friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t 
rnd_mode); - -private: - // Human friendly Debug Preview in Visual Studio. - // Put one of these lines: - // - // mpfr::mpreal= ; Show value only - // mpfr::mpreal=, bits ; Show value & precision - // - // at the beginning of - // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat - MPREAL_MSVC_DEBUGVIEW_DATA - - // "Smart" resources deallocation. Checks if instance initialized before deletion. - void clear(::mpfr_ptr); -}; - -////////////////////////////////////////////////////////////////////////// -// Exceptions -class conversion_overflow : public std::exception { -public: - std::string why() { return "inexact conversion from floating point"; } -}; - -////////////////////////////////////////////////////////////////////////// -// Constructors & converters -// Default constructor: creates mp number and initializes it to 0. -inline mpreal::mpreal() -{ - mpfr_init2(mpfr_ptr(), mpreal::get_default_prec()); - mpfr_set_zero_fast(mpfr_ptr()); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const mpreal& u) -{ - mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr())); - mpfr_set (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -#ifdef MPREAL_HAVE_MOVE_SUPPORT -inline mpreal::mpreal(mpreal&& other) -{ - mpfr_set_uninitialized(mpfr_ptr()); // make sure "other" holds null-pointer (in uninitialized state) - mpfr_swap(mpfr_ptr(), other.mpfr_ptr()); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal& mpreal::operator=(mpreal&& other) -{ - if (this != &other) - { - mpfr_swap(mpfr_ptr(), other.mpfr_ptr()); // destructor for "other" will be called just afterwards - MPREAL_MSVC_DEBUGVIEW_CODE; - } - return *this; -} -#endif - -inline mpreal::mpreal(const mpfr_t u, bool shared) -{ - if(shared) - { - std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t)); - } - else - { - mpfr_init2(mpfr_ptr(), mpfr_get_prec(u)); - mpfr_set (mpfr_ptr(), u, mpreal::get_default_rnd()); - } - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline 
mpreal::mpreal(const mpf_t u) -{ - mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t) - mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2(mpfr_ptr(), prec); - mpfr_set_z(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2(mpfr_ptr(), prec); - mpfr_set_q(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2(mpfr_ptr(), prec); - -#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1) - if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW)) - { - mpfr_set_d(mpfr_ptr(), u, mode); - }else - throw conversion_overflow(); -#else - mpfr_set_d(mpfr_ptr(), u, mode); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_ld(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_uj(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_sj(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_ui(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_ui(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), 
prec); - mpfr_set_si(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_si(mpfr_ptr(), u, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_str(mpfr_ptr(), s, base, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode) -{ - mpfr_init2 (mpfr_ptr(), prec); - mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode); - - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline void mpreal::clear(::mpfr_ptr x) -{ -#ifdef MPREAL_HAVE_MOVE_SUPPORT - if(mpfr_is_initialized(x)) -#endif - mpfr_clear(x); -} - -inline mpreal::~mpreal() -{ - clear(mpfr_ptr()); -} - -// internal namespace needed for template magic -namespace internal{ - - // Use SFINAE to restrict arithmetic operations instantiation only for numeric types - // This is needed for smooth integration with libraries based on expression templates, like Eigen. - // TODO: Do the same for boolean operators. 
- template struct result_type {}; - - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; - template <> struct result_type {typedef mpreal type;}; -} - -// + Addition -template -inline const typename internal::result_type::type - operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; } - -template -inline const typename internal::result_type::type - operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; } - -// - Subtraction -template -inline const typename internal::result_type::type - operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; } - -template -inline const typename internal::result_type::type - operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; } - -// * Multiplication -template -inline const typename internal::result_type::type - operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; } - -template -inline const typename internal::result_type::type - operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; } - -// / Division -template -inline const typename internal::result_type::type - operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; } - -template -inline const typename internal::result_type::type - operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; } - -////////////////////////////////////////////////////////////////////////// -// sqrt -const mpreal sqrt(const unsigned int v, mp_rnd_t 
rnd_mode = mpreal::get_default_rnd()); -const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -// abs -inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()); - -////////////////////////////////////////////////////////////////////////// -// pow -const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = 
mpreal::get_default_rnd()); - -const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = 
mpreal::get_default_rnd()); -const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); -inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd()); - -////////////////////////////////////////////////////////////////////////// -// Estimate machine epsilon for the given precision -// Returns smallest eps such that 1.0 + eps != 1.0 -inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec()); - -// Returns smallest eps such that x + eps != x (relative machine epsilon) -inline mpreal machine_epsilon(const mpreal& x); - -// Gives max & min values for the required precision, -// minval is 'safe' meaning 1 / minval does not overflow -// maxval is 'safe' meaning 1 / maxval does not underflow -inline mpreal minval(mp_prec_t 
prec = mpreal::get_default_prec()); -inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec()); - -// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps -inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps); - -// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} ) -inline bool isEqualFuzzy(const mpreal& a, const mpreal& b); - -// 'Bitwise' equality check -// maxUlps - a and b can be apart by maxUlps binary numbers. -inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps); - -////////////////////////////////////////////////////////////////////////// -// Convert precision in 'bits' to decimal digits and vice versa. -// bits = ceil(digits*log[2](10)) -// digits = floor(bits*log[10](2)) - -inline mp_prec_t digits2bits(int d); -inline int bits2digits(mp_prec_t b); - -////////////////////////////////////////////////////////////////////////// -// min, max -const mpreal (max)(const mpreal& x, const mpreal& y); -const mpreal (min)(const mpreal& x, const mpreal& y); - -////////////////////////////////////////////////////////////////////////// -// Implementation -////////////////////////////////////////////////////////////////////////// - -////////////////////////////////////////////////////////////////////////// -// Operators - Assignment -inline mpreal& mpreal::operator=(const mpreal& v) -{ - if (this != &v) - { - mp_prec_t tp = mpfr_get_prec( mpfr_srcptr()); - mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr()); - - if(tp != vp){ - clear(mpfr_ptr()); - mpfr_init2(mpfr_ptr(), vp); - } - - mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - } - return *this; -} - -inline mpreal& mpreal::operator=(const mpf_t v) -{ - mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const mpz_t v) -{ - mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return 
*this; -} - -inline mpreal& mpreal::operator=(const mpq_t v) -{ - mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const long double v) -{ - mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const double v) -{ -#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1) - if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW)) - { - mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd()); - }else - throw conversion_overflow(); -#else - mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd()); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const unsigned long int v) -{ - mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const unsigned int v) -{ - mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const unsigned long long int v) -{ - mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const long long int v) -{ - mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const long int v) -{ - mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const int v) -{ - mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd()); - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator=(const char* s) -{ - // Use other converters for more precise control on base & precision & rounding: - // - // mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode) - // mpreal(const std::string& s,mp_prec_t prec, int base, mp_rnd_t 
mode) - // - // Here we assume base = 10 and we use precision of target variable. - - mpfr_t t; - - mpfr_init2(t, mpfr_get_prec(mpfr_srcptr())); - - if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd())) - { - mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - } - - clear(t); - return *this; -} - -inline mpreal& mpreal::operator=(const std::string& s) -{ - // Use other converters for more precise control on base & precision & rounding: - // - // mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode) - // mpreal(const std::string& s,mp_prec_t prec, int base, mp_rnd_t mode) - // - // Here we assume base = 10 and we use precision of target variable. - - mpfr_t t; - - mpfr_init2(t, mpfr_get_prec(mpfr_srcptr())); - - if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd())) - { - mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - } - - clear(t); - return *this; -} - -template -inline mpreal& mpreal::operator= (const std::complex& z) -{ - return *this = z.real(); -} - -////////////////////////////////////////////////////////////////////////// -// + Addition -inline mpreal& mpreal::operator+=(const mpreal& v) -{ - mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpf_t u) -{ - *this += mpreal(u); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpz_t u) -{ - mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpq_t u) -{ - mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+= (const long double u) -{ - *this += mpreal(u); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+= (const double u) 
-{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); -#else - *this += mpreal(u); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const unsigned long int u) -{ - mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const unsigned int u) -{ - mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const long int u) -{ - mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const int u) -{ - mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } - -inline const mpreal mpreal::operator+()const { 
return mpreal(*this); } - -inline const mpreal operator+(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline mpreal& mpreal::operator++() -{ - return *this += 1; -} - -inline const mpreal mpreal::operator++ (int) -{ - mpreal x(*this); - *this += 1; - return x; -} - -inline mpreal& mpreal::operator--() -{ - return *this -= 1; -} - -inline const mpreal mpreal::operator-- (int) -{ - mpreal x(*this); - *this -= 1; - return x; -} - -////////////////////////////////////////////////////////////////////////// -// - Subtraction -inline mpreal& mpreal::operator-=(const mpreal& v) -{ - mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const mpz_t v) -{ - mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const mpq_t v) -{ - mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const long double v) -{ - *this -= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this -= mpreal(v); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const unsigned long int v) -{ - mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const unsigned int v) -{ - mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - 
-inline mpreal& mpreal::operator-=(const long int v) -{ - mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const int v) -{ - mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal mpreal::operator-()const -{ - mpreal u(*this); - mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd()); - return u; -} - -inline const mpreal operator-(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline const mpreal operator-(const double b, const mpreal& a) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -#else - mpreal x(b, mpfr_get_prec(a.mpfr_ptr())); - x -= a; - return x; -#endif -} - -inline const mpreal operator-(const unsigned long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const unsigned int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -////////////////////////////////////////////////////////////////////////// -// 
* Multiplication -inline mpreal& mpreal::operator*= (const mpreal& v) -{ - mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const mpz_t v) -{ - mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const mpq_t v) -{ - mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const long double v) -{ - *this *= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this *= mpreal(v); -#endif - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const unsigned long int v) -{ - mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const unsigned int v) -{ - mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const long int v) -{ - mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const int v) -{ - mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator*(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -////////////////////////////////////////////////////////////////////////// -// / Division 
-inline mpreal& mpreal::operator/=(const mpreal& v) -{ - mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const mpz_t v) -{ - mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const mpq_t v) -{ - mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const long double v) -{ - *this /= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this /= mpreal(v); -#endif - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const unsigned long int v) -{ - mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const unsigned int v) -{ - mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const long int v) -{ - mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const int v) -{ - mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator/(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr()))); - mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline const mpreal operator/(const unsigned long int b, const mpreal& a) -{ - mpreal x(0, 
mpfr_get_prec(a.mpfr_srcptr())); - mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const unsigned int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const double b, const mpreal& a) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -#else - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - x /= a; - return x; -#endif -} - -////////////////////////////////////////////////////////////////////////// -// Shifts operators - Multiplication/Division by power of 2 -inline mpreal& mpreal::operator<<=(const unsigned long int u) -{ - mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const unsigned int u) -{ - mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const long int u) -{ - mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const int u) -{ - mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& 
mpreal::operator>>=(const unsigned long int u) -{ - mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const unsigned int u) -{ - mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const long int u) -{ - mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const int u) -{ - mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator<<(const mpreal& v, const unsigned long int k) -{ - return mul_2ui(v,k); -} - -inline const mpreal operator<<(const mpreal& v, const unsigned int k) -{ - return mul_2ui(v,static_cast(k)); -} - -inline const mpreal operator<<(const mpreal& v, const long int k) -{ - return mul_2si(v,k); -} - -inline const mpreal operator<<(const mpreal& v, const int k) -{ - return mul_2si(v,static_cast(k)); -} - -inline const mpreal operator>>(const mpreal& v, const unsigned long int k) -{ - return div_2ui(v,k); -} - -inline const mpreal operator>>(const mpreal& v, const long int k) -{ - return div_2si(v,k); -} - -inline const mpreal operator>>(const mpreal& v, const unsigned int k) -{ - return div_2ui(v,static_cast(k)); -} - -inline const mpreal operator>>(const mpreal& v, const int k) -{ - return div_2si(v,static_cast(k)); -} - -// mul_2ui -inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode) -{ - mpreal x(v); - mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode); - return x; -} - -// mul_2si -inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode) -{ - mpreal x(v); - mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode); - return x; -} - -inline const mpreal div_2ui(const mpreal& v, 
unsigned long int k, mp_rnd_t rnd_mode) -{ - mpreal x(v); - mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode); - return x; -} - -inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode) -{ - mpreal x(v); - mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode); - return x; -} - -////////////////////////////////////////////////////////////////////////// -//Relational operators - -// WARNING: -// -// Please note that following checks for double-NaN are guaranteed to work only in IEEE math mode: -// -// isnan(b) = (b != b) -// isnan(b) = !(b == b) (we use in code below) -// -// Be cautions if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC). -// Use std::isnan instead (C++11). - -inline bool operator > (const mpreal& a, const mpreal& b ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } -inline bool operator > (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); } -inline bool operator > (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); } -inline bool operator > (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); } -inline bool operator > (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); } -inline bool operator > (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); } -inline bool operator > (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); } - -inline bool operator >= (const mpreal& a, const mpreal& b ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } -inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) 
&& (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); } -inline bool operator >= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); } -inline bool operator >= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); } -inline bool operator >= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); } -inline bool operator >= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); } -inline bool operator >= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); } - -inline bool operator < (const mpreal& a, const mpreal& b ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } -inline bool operator < (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); } -inline bool operator < (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); } -inline bool operator < (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); } -inline bool operator < (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); } -inline bool operator < (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); } -inline bool operator < (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); } - -inline bool operator <= (const mpreal& a, const mpreal& b ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } -inline bool operator <= (const mpreal& a, const unsigned long int 
b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); } -inline bool operator <= (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); } -inline bool operator <= (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); } -inline bool operator <= (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); } -inline bool operator <= (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); } -inline bool operator <= (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); } - -inline bool operator == (const mpreal& a, const mpreal& b ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); } -inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const unsigned int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const long double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); } -inline bool operator == (const mpreal& a, const double b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); } - -inline bool operator != (const mpreal& a, const mpreal& b ){ return !(a == b); } -inline bool operator != (const mpreal& a, const unsigned 
long int b ){ return !(a == b); } -inline bool operator != (const mpreal& a, const unsigned int b ){ return !(a == b); } -inline bool operator != (const mpreal& a, const long int b ){ return !(a == b); } -inline bool operator != (const mpreal& a, const int b ){ return !(a == b); } -inline bool operator != (const mpreal& a, const long double b ){ return !(a == b); } -inline bool operator != (const mpreal& a, const double b ){ return !(a == b); } - -inline bool isnan EIGEN_NOT_A_MACRO (const mpreal& op){ return (mpfr_nan_p (op.mpfr_srcptr()) != 0 ); } -inline bool (isinf) (const mpreal& op){ return (mpfr_inf_p (op.mpfr_srcptr()) != 0 ); } -inline bool (isfinite) (const mpreal& op){ return (mpfr_number_p (op.mpfr_srcptr()) != 0 ); } -inline bool iszero (const mpreal& op){ return (mpfr_zero_p (op.mpfr_srcptr()) != 0 ); } -inline bool isint (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcptr()) != 0 ); } - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) -inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));} -#endif - -////////////////////////////////////////////////////////////////////////// -// Type Converters -inline bool mpreal::toBool ( ) const { return mpfr_zero_p (mpfr_srcptr()) == 0; } -inline long mpreal::toLong (mp_rnd_t mode) const { return mpfr_get_si (mpfr_srcptr(), mode); } -inline unsigned long mpreal::toULong (mp_rnd_t mode) const { return mpfr_get_ui (mpfr_srcptr(), mode); } -inline float mpreal::toFloat (mp_rnd_t mode) const { return mpfr_get_flt(mpfr_srcptr(), mode); } -inline double mpreal::toDouble (mp_rnd_t mode) const { return mpfr_get_d (mpfr_srcptr(), mode); } -inline long double mpreal::toLDouble(mp_rnd_t mode) const { return mpfr_get_ld (mpfr_srcptr(), mode); } -inline long long mpreal::toLLong (mp_rnd_t mode) const { return mpfr_get_sj (mpfr_srcptr(), mode); } -inline unsigned long long mpreal::toULLong (mp_rnd_t mode) const { return mpfr_get_uj (mpfr_srcptr(), mode); } - -inline ::mpfr_ptr 
mpreal::mpfr_ptr() { return mp; } -inline ::mpfr_srcptr mpreal::mpfr_ptr() const { return mp; } -inline ::mpfr_srcptr mpreal::mpfr_srcptr() const { return mp; } - -template -inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&)) -{ - std::ostringstream oss; - oss << f << t; - return oss.str(); -} - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - -inline std::string mpreal::toString(const std::string& format) const -{ - char *s = NULL; - std::string out; - - if( !format.empty() ) - { - if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0)) - { - out = std::string(s); - - mpfr_free_str(s); - } - } - - return out; -} - -#endif - -inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const -{ - // TODO: Add extended format specification (f, e, rounding mode) as it done in output operator - (void)b; - (void)mode; - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - - std::ostringstream format; - - int digits = (n >= 0) ? n : 2 + bits2digits(mpfr_get_prec(mpfr_srcptr())); - - format << "%." 
<< digits << "RNg"; - - return toString(format.str()); - -#else - - char *s, *ns = NULL; - size_t slen, nslen; - mp_exp_t exp; - std::string out; - - if(mpfr_inf_p(mp)) - { - if(mpfr_sgn(mp)>0) return "+Inf"; - else return "-Inf"; - } - - if(mpfr_zero_p(mp)) return "0"; - if(mpfr_nan_p(mp)) return "NaN"; - - s = mpfr_get_str(NULL, &exp, b, 0, mp, mode); - ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode); - - if(s!=NULL && ns!=NULL) - { - slen = strlen(s); - nslen = strlen(ns); - if(nslen<=slen) - { - mpfr_free_str(s); - s = ns; - slen = nslen; - } - else { - mpfr_free_str(ns); - } - - // Make human eye-friendly formatting if possible - if (exp>0 && static_cast(exp)s+exp) ptr--; - - if(ptr==s+exp) out = std::string(s,exp+1); - else out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1); - - //out = string(s,exp+1)+'.'+string(s+exp+1); - } - else - { - // Remove zeros starting from right end - char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s+exp-1) ptr--; - - if(ptr==s+exp-1) out = std::string(s,exp); - else out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1); - - //out = string(s,exp)+'.'+string(s+exp); - } - - }else{ // exp<0 || exp>slen - if(s[0]=='-') - { - // Remove zeros starting from right end - char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s+1) ptr--; - - if(ptr==s+1) out = std::string(s,2); - else out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1); - - //out = string(s,2)+'.'+string(s+2); - } - else - { - // Remove zeros starting from right end - char* ptr = s+slen-1; - while (*ptr=='0' && ptr>s) ptr--; - - if(ptr==s) out = std::string(s,1); - else out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1); - - //out = string(s,1)+'.'+string(s+1); - } - - // Make final string - if(--exp) - { - if(exp>0) out += "e+"+mpfr::toString(exp,std::dec); - else out += "e"+mpfr::toString(exp,std::dec); - } - } - - mpfr_free_str(s); - return out; - }else{ - return "conversion error!"; - } -#endif -} - - 
-////////////////////////////////////////////////////////////////////////// -// I/O -inline std::ostream& mpreal::output(std::ostream& os) const -{ - std::ostringstream format; - const std::ios::fmtflags flags = os.flags(); - - format << ((flags & std::ios::showpos) ? "%+" : "%"); - if (os.precision() >= 0) - format << '.' << os.precision() << "R*" - << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' : - (flags & std::ios::floatfield) == std::ios::scientific ? 'e' : - 'g'); - else - format << "R*e"; - - char *s = NULL; - if(!(mpfr_asprintf(&s, format.str().c_str(), - mpfr::mpreal::get_default_rnd(), - mpfr_srcptr()) - < 0)) - { - os << std::string(s); - mpfr_free_str(s); - } - return os; -} - -inline std::ostream& operator<<(std::ostream& os, const mpreal& v) -{ - return v.output(os); -} - -inline std::istream& operator>>(std::istream &is, mpreal& v) -{ - // TODO: use cout::hexfloat and other flags to setup base - std::string tmp; - is >> tmp; - mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd()); - return is; -} - -////////////////////////////////////////////////////////////////////////// -// Bits - decimal digits relation -// bits = ceil(digits*log[2](10)) -// digits = floor(bits*log[10](2)) - -inline mp_prec_t digits2bits(int d) -{ - const double LOG2_10 = 3.3219280948873624; - - return mp_prec_t(std::ceil( d * LOG2_10 )); -} - -inline int bits2digits(mp_prec_t b) -{ - const double LOG10_2 = 0.30102999566398119; - - return int(std::floor( b * LOG10_2 )); -} - -////////////////////////////////////////////////////////////////////////// -// Set/Get number properties -inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode) -{ - mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), sign < 0, RoundingMode); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline int mpreal::getPrecision() const -{ - return int(mpfr_get_prec(mpfr_srcptr())); -} - -inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode) -{ - 
mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::setInf(int sign) -{ - mpfr_set_inf(mpfr_ptr(), sign); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::setNan() -{ - mpfr_set_nan(mpfr_ptr()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::setZero(int sign) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) - mpfr_set_zero(mpfr_ptr(), sign); -#else - mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)()); - setSign(sign); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mp_prec_t mpreal::get_prec() const -{ - return mpfr_get_prec(mpfr_srcptr()); -} - -inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode) -{ - mpfr_prec_round(mpfr_ptr(),prec,rnd_mode); - MPREAL_MSVC_DEBUGVIEW_CODE; -} - -inline mp_exp_t mpreal::get_exp () const -{ - return mpfr_get_exp(mpfr_srcptr()); -} - -inline int mpreal::set_exp (mp_exp_t e) -{ - int x = mpfr_set_exp(mpfr_ptr(), e); - MPREAL_MSVC_DEBUGVIEW_CODE; - return x; -} - -inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd()) -{ - mpreal y(x); -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0)) - mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode); -#else - *exp = mpfr_get_exp(y.mpfr_srcptr()); - mpfr_set_exp(y.mpfr_ptr(),0); -#endif - return y; -} - -inline const mpreal ldexp(const mpreal& v, mp_exp_t exp) -{ - mpreal x(v); - - // rounding is not important since we are just increasing the exponent (= exact operation) - mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd()); - return x; -} - -inline const mpreal scalbn(const mpreal& v, mp_exp_t exp) -{ - return ldexp(v, exp); -} - -inline mpreal machine_epsilon(mp_prec_t prec) -{ - /* the smallest eps such that 1 + eps != 1 */ - return machine_epsilon(mpreal(1, prec)); -} - -inline mpreal machine_epsilon(const mpreal& x) -{ - /* the smallest eps such that x + 
eps != x */ - if( x < 0) - { - return nextabove(-x) + x; - }else{ - return nextabove( x) - x; - } -} - -// minval is 'safe' meaning 1 / minval does not overflow -inline mpreal minval(mp_prec_t prec) -{ - /* min = 1/2 * 2^emin = 2^(emin - 1) */ - return mpreal(1, prec) << mpreal::get_emin()-1; -} - -// maxval is 'safe' meaning 1 / maxval does not underflow -inline mpreal maxval(mp_prec_t prec) -{ - /* max = (1 - eps) * 2^emax, eps is machine epsilon */ - return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax(); -} - -inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps) -{ - return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps; -} - -inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps) -{ - return abs(a - b) <= eps; -} - -inline bool isEqualFuzzy(const mpreal& a, const mpreal& b) -{ - return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b))))); -} - -////////////////////////////////////////////////////////////////////////// -// C++11 sign functions. 
-inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal rop(0, mpfr_get_prec(x.mpfr_ptr())); - mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode); - return rop; -} - -inline bool signbit(const mpreal& x) -{ - return mpfr_signbit(x.mpfr_srcptr()); -} - -inline mpreal& setsignbit(mpreal& x, bool minus, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpfr_setsign(x.mpfr_ptr(), x.mpfr_srcptr(), minus, rnd_mode); - return x; -} - -inline const mpreal modf(const mpreal& v, mpreal& n) -{ - mpreal f(v); - - // rounding is not important since we are using the same number - mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd()); - mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr()); - return f; -} - -inline int mpreal::check_range (int t, mp_rnd_t rnd_mode) -{ - return mpfr_check_range(mpfr_ptr(),t,rnd_mode); -} - -inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode) -{ - int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode); - MPREAL_MSVC_DEBUGVIEW_CODE; - return r; -} - -inline mp_exp_t mpreal::get_emin (void) -{ - return mpfr_get_emin(); -} - -inline int mpreal::set_emin (mp_exp_t exp) -{ - return mpfr_set_emin(exp); -} - -inline mp_exp_t mpreal::get_emax (void) -{ - return mpfr_get_emax(); -} - -inline int mpreal::set_emax (mp_exp_t exp) -{ - return mpfr_set_emax(exp); -} - -inline mp_exp_t mpreal::get_emin_min (void) -{ - return mpfr_get_emin_min(); -} - -inline mp_exp_t mpreal::get_emin_max (void) -{ - return mpfr_get_emin_max(); -} - -inline mp_exp_t mpreal::get_emax_min (void) -{ - return mpfr_get_emax_min(); -} - -inline mp_exp_t mpreal::get_emax_max (void) -{ - return mpfr_get_emax_max(); -} - -////////////////////////////////////////////////////////////////////////// -// Mathematical Functions -////////////////////////////////////////////////////////////////////////// -#define MPREAL_UNARY_MATH_FUNCTION_BODY(f) \ - mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); \ - 
mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r); \ - return y; - -inline const mpreal sqr (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) -{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqr ); } - -inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) -{ MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt); } - -inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r) -{ - mpreal y; - mpfr_sqrt_ui(y.mpfr_ptr(), x, r); - return y; -} - -inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode) -{ - return sqrt(static_cast(v),rnd_mode); -} - -inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode) -{ - if (v>=0) return sqrt(static_cast(v),rnd_mode); - else return mpreal().setNan(); // NaN -} - -inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode) -{ - if (v>=0) return sqrt(static_cast(v),rnd_mode); - else return mpreal().setNan(); // NaN -} - -inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal y(0, mpfr_get_prec(x.mpfr_srcptr())); - #if (MPFR_VERSION >= MPFR_VERSION_NUM(4,0,0)) - mpfr_rootn_ui(y.mpfr_ptr(), x.mpfr_srcptr(), k, r); - #else - mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r); - #endif - return y; -} - -inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal y(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r); - return y; -} - -inline int cmpabs(const mpreal& a,const mpreal& b) -{ - return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr()); -} - -inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode); -} - -inline const mpreal sqrt (const long double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); } -inline const mpreal sqrt (const double v, mp_rnd_t rnd_mode) { return sqrt(mpreal(v),rnd_mode); } - -inline const mpreal cbrt (const 
mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt ); } -inline const mpreal fabs (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); } -inline const mpreal abs (const mpreal& x, mp_rnd_t r) { MPREAL_UNARY_MATH_FUNCTION_BODY(abs ); } -inline const mpreal log (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log ); } -inline const mpreal log2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log2 ); } -inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log10); } -inline const mpreal exp (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp ); } -inline const mpreal exp2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 ); } -inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(exp10); } -inline const mpreal cos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cos ); } -inline const mpreal sin (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sin ); } -inline const mpreal tan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tan ); } -inline const mpreal sec (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sec ); } -inline const mpreal csc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csc ); } -inline const mpreal cot (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cot ); } -inline const mpreal acos (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acos ); } -inline const mpreal asin 
(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(asin ); } -inline const mpreal atan (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atan ); } - -inline const mpreal logb (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { return log2 (abs(x),r); } - -inline const mpreal acot (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atan (1/v, r); } -inline const mpreal asec (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acos (1/v, r); } -inline const mpreal acsc (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asin (1/v, r); } -inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return atanh(1/v, r); } -inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return acosh(1/v, r); } -inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) { return asinh(1/v, r); } - -inline const mpreal cosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(cosh ); } -inline const mpreal sinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sinh ); } -inline const mpreal tanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(tanh ); } -inline const mpreal sech (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(sech ); } -inline const mpreal csch (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(csch ); } -inline const mpreal coth (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(coth ); } -inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(acosh); } -inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { 
MPREAL_UNARY_MATH_FUNCTION_BODY(asinh); } -inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(atanh); } - -inline const mpreal log1p (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(log1p ); } -inline const mpreal expm1 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(expm1 ); } -inline const mpreal eint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(eint ); } -inline const mpreal gamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); } -inline const mpreal tgamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(gamma ); } -inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma); } -inline const mpreal zeta (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(zeta ); } -inline const mpreal erf (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erf ); } -inline const mpreal erfc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(erfc ); } -inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j0 ); } -inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(j1 ); } -inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y0 ); } -inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(y1 ); } - -inline const mpreal nextpow2(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal y(0, x.getPrecision()); - - if(!iszero(x)) - 
y = ceil(log2(abs(x,r),r)); - - return y; -} - -inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision())); - mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode); - return a; -} - -inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision())); - mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode); - return a; -} - -inline const mpreal hypot(const mpreal& a, const mpreal& b, const mpreal& c) -{ - if(isnan EIGEN_NOT_A_MACRO (a) || isnan EIGEN_NOT_A_MACRO (b) || isnan EIGEN_NOT_A_MACRO(c)) return mpreal().setNan(); - else - { - mpreal absa = abs(a), absb = abs(b), absc = abs(c); - mpreal w = (std::max)(absa, (std::max)(absb, absc)); - mpreal r; - - if (!iszero(w)) - { - mpreal iw = 1/w; - r = w * sqrt(sqr(absa*iw) + sqr(absb*iw) + sqr(absc*iw)); - } - - return r; - } -} - -inline const mpreal hypot(const mpreal& a, const mpreal& b, const mpreal& c, const mpreal& d) -{ - if(isnan EIGEN_NOT_A_MACRO (a) || isnan EIGEN_NOT_A_MACRO (b) || isnan EIGEN_NOT_A_MACRO (c) || isnan EIGEN_NOT_A_MACRO (d)) return mpreal().setNan(); - else - { - mpreal absa = abs(a), absb = abs(b), absc = abs(c), absd = abs(d); - mpreal w = (std::max)(absa, (std::max)(absb, (std::max)(absc, absd))); - mpreal r; - - if (!iszero(w)) - { - mpreal iw = 1/w; - r = w * sqrt(sqr(absa*iw) + sqr(absb*iw) + sqr(absc*iw) + sqr(absd*iw)); - } - - return r; - } -} - -inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision())); - mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode); - return a; -} - -inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal 
a(0,(std::max)(y.getPrecision(), x.getPrecision())); - mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode); - return a; -} - -inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec = mpreal::get_default_prec(), - mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(0, prec); - mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode); - return x; -} - - -inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(v); - int tsignp; - - if(signp) mpfr_lgamma(x.mpfr_ptr(), signp,v.mpfr_srcptr(),rnd_mode); - else mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode); - - return x; -} - - -inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal y(0, x.getPrecision()); - mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r); - return y; -} - -inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal y(0, x.getPrecision()); - mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r); - return y; -} - -inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a; - mp_prec_t p1, p2, p3; - - p1 = v1.get_prec(); - p2 = v2.get_prec(); - p3 = v3.get_prec(); - - a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1)); - - mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode); - return a; -} - -inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a; - mp_prec_t p1, p2, p3; - - p1 = v1.get_prec(); - p2 = v2.get_prec(); - p3 = v3.get_prec(); - - a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1)); - - mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode); - return a; -} - -inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a; - mp_prec_t p1, p2; - - p1 = v1.get_prec(); - p2 = v2.get_prec(); - - a.set_prec(p1>p2?p1:p2); - - 
mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode); - - return a; -} - -inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd()) -{ - mpfr_srcptr *p = new mpfr_srcptr[n]; - - for (unsigned long int i = 0; i < n; i++) - p[i] = tab[i].mpfr_srcptr(); - - mpreal x; - status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode); - - delete [] p; - return x; -} - -////////////////////////////////////////////////////////////////////////// -// MPFR 2.4.0 Specifics -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - -inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode); -} - -inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) -{ - MPREAL_UNARY_MATH_FUNCTION_BODY(li2); -} - -inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - /* R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */ - return fmod(x, y, rnd_mode); -} - -inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - (void)rnd_mode; - - /* - - m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y) - - The following are true by convention: - - mod(x,0) is x - - mod(x,x) is 0 - - mod(x,y) for x != y and y != 0 has the same sign as y. 
- - */ - - if(iszero(y)) return x; - if(x == y) return 0; - - mpreal m = x - floor(x / y) * y; - - return copysign(abs(m),y); // make sure result has the same sign as Y -} - -inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal a; - mp_prec_t yp, xp; - - yp = y.get_prec(); - xp = x.get_prec(); - - a.set_prec(yp>xp?yp:xp); - - mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode); - - return a; -} - -inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(v); - mpfr_rec_sqrt(x.mp,v.mp,rnd_mode); - return x; -} -#endif // MPFR 2.4.0 Specifics - -////////////////////////////////////////////////////////////////////////// -// MPFR 3.0.0 Specifics -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) -inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(digamma); } -inline const mpreal ai (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(ai); } -#endif // MPFR 3.0.0 Specifics - -////////////////////////////////////////////////////////////////////////// -// Constants -inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal x(0, p); - mpfr_const_log2(x.mpfr_ptr(), r); - return x; -} - -inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal x(0, p); - mpfr_const_pi(x.mpfr_ptr(), r); - return x; -} - -inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal x(0, p); - mpfr_const_euler(x.mpfr_ptr(), r); - return x; -} - -inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd()) -{ - mpreal x(0, p); - mpfr_const_catalan(x.mpfr_ptr(), r); - return x; -} - -inline const mpreal const_infinity (int sign = 1, mp_prec_t 
p = mpreal::get_default_prec()) -{ - mpreal x(0, p); - mpfr_set_inf(x.mpfr_ptr(), sign); - return x; -} - -////////////////////////////////////////////////////////////////////////// -// Integer Related Functions -inline const mpreal ceil(const mpreal& v) -{ - mpreal x(v); - mpfr_ceil(x.mp,v.mp); - return x; -} - -inline const mpreal floor(const mpreal& v) -{ - mpreal x(v); - mpfr_floor(x.mp,v.mp); - return x; -} - -inline const mpreal round(const mpreal& v) -{ - mpreal x(v); - mpfr_round(x.mp,v.mp); - return x; -} - -inline const mpreal trunc(const mpreal& v) -{ - mpreal x(v); - mpfr_trunc(x.mp,v.mp); - return x; -} - -inline const mpreal rint (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint ); } -inline const mpreal rint_ceil (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil ); } -inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor); } -inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round); } -inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc); } -inline const mpreal frac (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) { MPREAL_UNARY_MATH_FUNCTION_BODY(frac ); } - -////////////////////////////////////////////////////////////////////////// -// Miscellaneous Functions -inline int sgn(const mpreal& op) -{ - // Please note, this is classic signum function which ignores sign of zero. - // Use signbit if you need sign of zero. 
- return mpfr_sgn(op.mpfr_srcptr()); -} - -////////////////////////////////////////////////////////////////////////// -// Miscellaneous Functions -inline void swap (mpreal& a, mpreal& b) { mpfr_swap(a.mpfr_ptr(),b.mpfr_ptr()); } -inline const mpreal (max)(const mpreal& x, const mpreal& y){ return (x>y?x:y); } -inline const mpreal (min)(const mpreal& x, const mpreal& y){ return (x= MPFR_VERSION_NUM(3,0,0)) -inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x; - mpfr_urandom(x.mpfr_ptr(), state, rnd_mode); - return x; -} -#endif - -#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2)) -inline const mpreal random2 (mp_size_t size, mp_exp_t exp) -{ - mpreal x; - mpfr_random2(x.mpfr_ptr(),size,exp); - return x; -} -#endif - -// Uniformly distributed random number generation -// a = random(seed); <- initialization & first random number generation -// a = random(); <- next random numbers generation -// seed != 0 -inline const mpreal random(unsigned int seed = 0) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0)) - static gmp_randstate_t state; - static bool initialize = true; - - if(initialize) - { - gmp_randinit_default(state); - gmp_randseed_ui(state,0); - initialize = false; - } - - if(seed != 0) gmp_randseed_ui(state,seed); - - return mpfr::urandom(state); -#else - if(seed != 0) std::srand(seed); - return mpfr::mpreal(std::rand()/(double)RAND_MAX); -#endif - -} - -#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0) && MPFR_VERSION < MPFR_VERSION_NUM(4,0,0)) - -// TODO: -// Use mpfr_nrandom since mpfr_grandom is deprecated -#if defined(_MSC_VER) -#pragma warning( push ) -#pragma warning( disable : 1478) -#endif -inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x; - mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode); - return x; -} -#if defined(_MSC_VER) -#pragma warning( pop ) -#endif - -inline const mpreal grandom(unsigned int seed = 0) -{ - static 
gmp_randstate_t state; - static bool initialize = true; - - if(initialize) - { - gmp_randinit_default(state); - gmp_randseed_ui(state,0); - initialize = false; - } - - if(seed != 0) gmp_randseed_ui(state,seed); - - return mpfr::grandom(state); -} -#endif - -////////////////////////////////////////////////////////////////////////// -// Set/Get global properties -inline void mpreal::set_default_prec(mp_prec_t prec) -{ - mpfr_set_default_prec(prec); -} - -inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode) -{ - mpfr_set_default_rounding_mode(rnd_mode); -} - -inline bool mpreal::fits_in_bits(double x, int n) -{ - int i; - double t; - return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0); -} - -inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(a); - mpfr_pow(x.mp,x.mp,b.mp,rnd_mode); - return x; -} - -inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(a); - mpfr_pow_z(x.mp,x.mp,b,rnd_mode); - return x; -} - -inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(a); - mpfr_pow_ui(x.mp,x.mp,b,rnd_mode); - return x; -} - -inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode) -{ - return pow(a,static_cast(b),rnd_mode); -} - -inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(a); - mpfr_pow_si(x.mp,x.mp,b,rnd_mode); - return x; -} - -inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode) -{ - return pow(a,static_cast(b),rnd_mode); -} - -inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode) -{ - return pow(a,mpreal(b),rnd_mode); -} - -inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode) -{ - return pow(a,mpreal(b),rnd_mode); -} - -inline const mpreal pow(const unsigned long 
int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd()) -{ - mpreal x(a); - mpfr_ui_pow(x.mp,a,b.mp,rnd_mode); - return x; -} - -inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode) -{ - return pow(static_cast(a),b,rnd_mode); -} - -inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode) -{ - if (a>=0) return pow(static_cast(a),b,rnd_mode); - else return pow(mpreal(a),b,rnd_mode); -} - -inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode) -{ - if (a>=0) return pow(static_cast(a),b,rnd_mode); - else return pow(mpreal(a),b,rnd_mode); -} - -inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),b,rnd_mode); -} - -inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),b,rnd_mode); -} - -// pow unsigned long int -inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode) -{ - mpreal x(a); - mpfr_ui_pow_ui(x.mp,a,b,rnd_mode); - return x; -} - -inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode) -{ - return pow(a,static_cast(b),rnd_mode); //mpfr_ui_pow_ui -} - -inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode) -{ - if(b>0) return pow(a,static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow -} - -inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode) -{ - if(b>0) return pow(a,static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow -} - -inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode) -{ - return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow -} - -inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode) -{ - return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow -} - -// pow unsigned int -inline const mpreal 
pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode) -{ - return pow(static_cast(a),b,rnd_mode); //mpfr_ui_pow_ui -} - -inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode) -{ - return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui -} - -inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode) -{ - if(b>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow -} - -inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode) -{ - if(b>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow -} - -inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode) -{ - return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow -} - -inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode) -{ - return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow -} - -// pow long int -inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode) -{ - if (a>0) return pow(static_cast(a),b,rnd_mode); //mpfr_ui_pow_ui - else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui -} - -inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode) -{ - if (a>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(mpreal(a),static_cast(b),rnd_mode); //mpfr_pow_ui -} - -inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode) -{ - if (a>0) - { - if(b>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - }else{ - return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si - } -} - -inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode) -{ - if (a>0) - { - if(b>0) return 
pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - }else{ - return pow(mpreal(a),static_cast(b),rnd_mode); // mpfr_pow_si - } -} - -inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode) -{ - if (a>=0) return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow -} - -inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode) -{ - if (a>=0) return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow -} - -// pow int -inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode) -{ - if (a>0) return pow(static_cast(a),b,rnd_mode); //mpfr_ui_pow_ui - else return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui -} - -inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode) -{ - if (a>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(mpreal(a),static_cast(b),rnd_mode); //mpfr_pow_ui -} - -inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode) -{ - if (a>0) - { - if(b>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - }else{ - return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si - } -} - -inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode) -{ - if (a>0) - { - if(b>0) return pow(static_cast(a),static_cast(b),rnd_mode); //mpfr_ui_pow_ui - else return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - }else{ - return pow(mpreal(a),static_cast(b),rnd_mode); // mpfr_pow_si - } -} - -inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode) -{ - if (a>=0) return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow -} - -inline const mpreal pow(const int a, 
const double b, mp_rnd_t rnd_mode) -{ - if (a>=0) return pow(static_cast(a),mpreal(b),rnd_mode); //mpfr_ui_pow - else return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow -} - -// pow long double -inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),mpreal(b),rnd_mode); -} - -inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui -} - -inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),static_cast(b),rnd_mode); //mpfr_pow_ui -} - -inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si -} - -inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),static_cast(b),rnd_mode); // mpfr_pow_si -} - -inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),mpreal(b),rnd_mode); -} - -inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui -} - -inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),static_cast(b),rnd_mode); // mpfr_pow_ui -} - -inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si -} - -inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode) -{ - return pow(mpreal(a),static_cast(b),rnd_mode); // mpfr_pow_si -} -} // End of mpfr namespace - -// Explicit specialization of std::swap for mpreal numbers -// Thus standard algorithms will use efficient version of swap (due to Koenig lookup) -// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap -namespace std -{ - // we are allowed to extend namespace std with 
specializations only - template <> - inline void swap(mpfr::mpreal& x, mpfr::mpreal& y) - { - return mpfr::swap(x, y); - } - - template<> - class numeric_limits - { - public: - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const int radix = 2; - - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - - static const bool is_iec559 = true; // = IEEE 754 - static const bool is_bounded = true; - static const bool is_modulo = false; - static const bool traps = true; - static const bool tinyness_before = true; - - static const float_denorm_style has_denorm = denorm_absent; - - inline static mpfr::mpreal (min) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::minval(precision); } - inline static mpfr::mpreal (max) (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(precision); } - inline static mpfr::mpreal lowest (mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(precision); } - - // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon) - inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(precision); } - - // Returns smallest eps such that x + eps != x (relative machine epsilon) - inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) { return mpfr::machine_epsilon(x); } - - inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec()) - { - mp_rnd_t r = mpfr::mpreal::get_default_rnd(); - - if(r == GMP_RNDN) return mpfr::mpreal(0.5, precision); - else return mpfr::mpreal(1.0, precision); - } - - inline static const mpfr::mpreal infinity() { return mpfr::const_infinity(); } - inline static const mpfr::mpreal quiet_NaN() { return mpfr::mpreal().setNan(); } - inline static const 
mpfr::mpreal signaling_NaN() { return mpfr::mpreal().setNan(); } - inline static const mpfr::mpreal denorm_min() { return (min)(); } - - // Please note, exponent range is not fixed in MPFR - static const int min_exponent = MPFR_EMIN_DEFAULT; - static const int max_exponent = MPFR_EMAX_DEFAULT; - MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811); - MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811); - -#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS - - // Following members should be constant according to standard, but they can be variable in MPFR - // So we define them as functions here. - // - // This is preferable way for std::numeric_limits specialization. - // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost. - // See below for compatible implementation. - inline static float_round_style round_style() - { - mp_rnd_t r = mpfr::mpreal::get_default_rnd(); - - switch (r) - { - case GMP_RNDN: return round_to_nearest; - case GMP_RNDZ: return round_toward_zero; - case GMP_RNDU: return round_toward_infinity; - case GMP_RNDD: return round_toward_neg_infinity; - default: return round_indeterminate; - } - } - - inline static int digits() { return int(mpfr::mpreal::get_default_prec()); } - inline static int digits(const mpfr::mpreal& x) { return x.getPrecision(); } - - inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec()) - { - return mpfr::bits2digits(precision); - } - - inline static int digits10(const mpfr::mpreal& x) - { - return mpfr::bits2digits(x.getPrecision()); - } - - inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec()) - { - return digits10(precision); - } -#else - // Digits and round_style are NOT constants when it comes to mpreal. - // If possible, please use functions digits() and round_style() defined above. 
- // - // These (default) values are preserved for compatibility with existing libraries, e.g. boost. - // Change them accordingly to your application. - // - // For example, if you use 256 bits of precision uniformly in your program, then: - // digits = 256 - // digits10 = 77 - // max_digits10 = 78 - // - // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details. - - static const std::float_round_style round_style = round_to_nearest; - static const int digits = 53; - static const int digits10 = 15; - static const int max_digits10 = 16; -#endif - }; - -} - -#endif /* __MPREAL_H__ */ diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 4a25e993c..10beb0714 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -1,3 +1,4 @@ +#include // Must be included before main.h. #include "main.h" #include #include -- GitLab From 5b83d3c4bcf41a0153eb77897eadde2f298f023d Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 4 Aug 2021 13:39:09 -0700 Subject: [PATCH 079/266] Make inverse 3x3 faster and avoid gcc bug. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There seems to be a gcc 4.7 bug that incorrectly flags the current 3x3 inverse as using uninitialized memory. I'm *pretty* sure it's a false positive, but it's hard to trigger. The same warning does not trigger with clang or later compiler versions. In trying to find a work-around, this implementation turns out to be faster anyways for static-sized matrices. 
``` name old cpu/op new cpu/op delta BM_Inverse3x3> 423ns ± 2% 433ns ± 3% +2.32% (p=0.000 n=98+96) BM_Inverse3x3> 425ns ± 2% 427ns ± 3% +0.48% (p=0.003 n=99+96) BM_Inverse3x3> 7.10ns ± 2% 0.80ns ± 1% -88.67% (p=0.000 n=114+112) BM_Inverse3x3> 7.45ns ± 2% 1.34ns ± 1% -82.01% (p=0.000 n=105+111) BM_AliasedInverse3x3> 409ns ± 3% 419ns ± 3% +2.40% (p=0.000 n=100+98) BM_AliasedInverse3x3> 414ns ± 3% 413ns ± 2% ~ (p=0.322 n=98+98) BM_AliasedInverse3x3> 7.57ns ± 1% 0.80ns ± 1% -89.37% (p=0.000 n=111+114) BM_AliasedInverse3x3> 9.09ns ± 1% 2.58ns ±41% -71.60% (p=0.000 n=113+116) ``` (cherry picked from commit 5ad8b9bfe2bf75620bc89467c5cc051fc2a597df) --- Eigen/src/LU/InverseImpl.h | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/Eigen/src/LU/InverseImpl.h b/Eigen/src/LU/InverseImpl.h index 27e263945..a40cefa9e 100644 --- a/Eigen/src/LU/InverseImpl.h +++ b/Eigen/src/LU/InverseImpl.h @@ -144,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. 
+ typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3(matrix) * invdet; + const Scalar c11 = cofactor_3x3(matrix) * invdet; + const Scalar c02 = cofactor_3x3(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template @@ -166,12 +171,7 @@ struct compute_inverse cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); const Scalar det = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); const Scalar invdet = Scalar(1) / det; - if(extract_data(matrix) != extract_data(result)) { - compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result); - } else { - MatrixType matrix_t = matrix; - compute_inverse_size3_helper(matrix_t, invdet, cofactors_col0, result); - } + compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result); } }; @@ -187,22 +187,16 @@ struct compute_inverse_and_det_with_check bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; - if(extract_data(matrix) != extract_data(inverse)) { - compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); - } else { - MatrixType matrix_t = matrix; - compute_inverse_size3_helper(matrix_t, invdet, cofactors_col0, inverse); - } + compute_inverse_size3_helper(matrix, invdet, cofactors_col0, 
inverse); } }; -- GitLab From 4240b480e0a5e2f9063a672d19d70e6121791310 Mon Sep 17 00:00:00 2001 From: Jens Wehner Date: Thu, 5 Aug 2021 17:21:16 +0000 Subject: [PATCH 080/266] updated documentation for middleCol and middleRow (cherry picked from commit 4d870c49b7f1b49e34e8044dc6c1131d43e91a44) --- doc/TutorialBlockOperations.dox | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/doc/TutorialBlockOperations.dox b/doc/TutorialBlockOperations.dox index a2d8c97cc..df277482c 100644 --- a/doc/TutorialBlockOperations.dox +++ b/doc/TutorialBlockOperations.dox @@ -167,6 +167,20 @@ matrix.rightCols(q);\endcode \code matrix.rightCols();\endcode +%Block containing the q columns starting from i + \link DenseBase::middleCols() * \endlink + \code +matrix.middleCols(i,q);\endcode + \code +matrix.middleCols(i);\endcode + +%Block containing the q rows starting from i + \link DenseBase::middleRows() * \endlink + \code +matrix.middleRows(i,q);\endcode + \code +matrix.middleRows(i);\endcode + Here is a simple example illustrating the use of the operations presented above: -- GitLab From 1e9f623f3edd9b9246b7752af0275165fd8a2d23 Mon Sep 17 00:00:00 2001 From: "Daniel N. Miller (APD)" Date: Wed, 23 Jun 2021 21:09:53 -0700 Subject: [PATCH 081/266] Do not build shared libs if not supported (cherry picked from commit 09d7122468fb9b9adf813cf32167ab212511c4d8) --- CMakeLists.txt | 3 +++ blas/CMakeLists.txt | 25 ++++++++++++++++--------- lapack/CMakeLists.txt | 30 +++++++++++++++++------------- 3 files changed, 36 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bd1af32b2..f3e69b845 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,9 @@ else() ei_add_cxx_compiler_flag("-std=c++03") endif() +# Determine if we should build shared libraries on this platform. 
+get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS) + ############################################################################# # find how to link to the standard libraries # ############################################################################# diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 545bc989c..f3a94ec4a 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -26,20 +26,27 @@ else() set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) endif() +set(EIGEN_BLAS_TARGETS "") + add_library(eigen_blas_static ${EigenBlas_SRCS}) -add_library(eigen_blas SHARED ${EigenBlas_SRCS}) +list(APPEND EIGEN_BLAS_TARGETS eigen_blas_static) -if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(eigen_blas_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - target_link_libraries(eigen_blas ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +if (EIGEN_BUILD_SHARED_LIBS) + add_library(eigen_blas SHARED ${EigenBlas_SRCS}) + list(APPEND EIGEN_BLAS_TARGETS eigen_blas) endif() -add_dependencies(blas eigen_blas eigen_blas_static) +foreach(target IN LISTS EIGEN_BLAS_TARGETS) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() -install(TARGETS eigen_blas eigen_blas_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) + add_dependencies(blas ${target}) + install(TARGETS ${target} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endforeach() if(EIGEN_Fortran_COMPILER_WORKS) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 9eec81076..e48497fda 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -88,25 +88,29 @@ endif() endif() -add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS}) -add_library(eigen_lapack SHARED ${EigenLapack_SRCS}) +set(EIGEN_LAPACK_TARGETS "") -target_link_libraries(eigen_lapack eigen_blas) +add_library(eigen_lapack_static 
${EigenLapack_SRCS} ${ReferenceLapack_SRCS}) +list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack_static) -if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(eigen_lapack_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - target_link_libraries(eigen_lapack ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +if (EIGEN_BUILD_SHARED_LIBS) + add_library(eigen_lapack SHARED ${EigenLapack_SRCS}) + list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack) + target_link_libraries(eigen_lapack eigen_blas) endif() -add_dependencies(lapack eigen_lapack eigen_lapack_static) +foreach(target IN LISTS EIGEN_LAPACK_TARGETS) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() + add_dependencies(lapack ${target}) + install(TARGETS ${target} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endforeach() -install(TARGETS eigen_lapack eigen_lapack_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - - get_filename_component(eigen_full_path_to_testing_lapack "./testing/" ABSOLUTE) if(EXISTS ${eigen_full_path_to_testing_lapack}) -- GitLab From 4e0357c6dd6fa4f024362f3affdcac6b24253815 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 6 Aug 2021 20:48:10 +0000 Subject: [PATCH 082/266] Avoid memory allocation in tridiagonalization_inplace_selector::run. 
(cherry picked from commit a5a7faeb455efd7f6edb1138eda2e37546039b7d) --- .../src/Eigenvalues/SelfAdjointEigenSolver.h | 7 ++++++- Eigen/src/Eigenvalues/Tridiagonalization.h | 20 +++++++++---------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 59e59644e..14692365f 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -125,6 +125,7 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), + m_hcoeffs(), m_info(InvalidInput), m_isInitialized(false), m_eigenvectorsOk(false) @@ -147,6 +148,7 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), + m_hcoeffs(size > 1 ? size - 1 : 1), m_isInitialized(false), m_eigenvectorsOk(false) {} @@ -172,6 +174,7 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), + m_hcoeffs(matrix.cols() > 1 ? 
matrix.cols() - 1 : 1), m_isInitialized(false), m_eigenvectorsOk(false) { @@ -378,6 +381,7 @@ template class SelfAdjointEigenSolver EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; + typename TridiagonalizationType::CoeffVectorType m_hcoeffs; ComputationInfo m_info; bool m_isInitialized; bool m_eigenvectorsOk; @@ -450,7 +454,8 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(scale==RealScalar(0)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); - internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors); + m_hcoeffs.resize(n-1); + internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors); m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec); diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 6c8084f76..674c92a39 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -425,12 +425,13 @@ struct tridiagonalization_inplace_selector; * * \sa class Tridiagonalization */ -template +template EIGEN_DEVICE_FUNC -void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) +void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, + CoeffVectorType& hcoeffs, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); - tridiagonalization_inplace_selector::run(mat, diag, subdiag, extractQ); + tridiagonalization_inplace_selector::run(mat, diag, subdiag, hcoeffs, extractQ); } /** \internal @@ -443,10 +444,9 @@ struct tridiagonalization_inplace_selector typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; template static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, 
SubDiagonalType& subdiag, bool extractQ) + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { - CoeffVectorType hCoeffs(mat.cols()-1); - tridiagonalization_inplace(mat,hCoeffs); + tridiagonalization_inplace(mat, hCoeffs); diag = mat.diagonal().real(); subdiag = mat.template diagonal<-1>().real(); if(extractQ) @@ -466,8 +466,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + template + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ) { using std::sqrt; const RealScalar tol = (std::numeric_limits::min)(); @@ -511,9 +511,9 @@ struct tridiagonalization_inplace_selector { typedef typename MatrixType::Scalar Scalar; - template + template static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) -- GitLab From 93bff85a4277df59d7763995229fcc157eff0874 Mon Sep 17 00:00:00 2001 From: Gauri Deshpande Date: Mon, 9 Aug 2021 22:15:21 +0000 Subject: [PATCH 083/266] remove denormal flushing in fp32tobf16 for avx & avx512 (cherry picked from commit e6a5a594a7f3cbe2f9843d4ef57a10d478cbb818) --- Eigen/src/Core/arch/AVX/PacketMath.h | 11 +-- Eigen/src/Core/arch/AVX512/PacketMath.h | 18 ++--- Eigen/src/Core/arch/Default/BFloat16.h | 7 -- test/bfloat16_float.cpp | 89 ------------------------- 4 files changed, 8 insertions(+), 117 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index dd3f243d2..7fc32fd71 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -1274,12 +1274,7 @@ 
EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { Packet8bf r; - // Flush input denormals value to zero with hardware capability. - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); - __m256 flush = _mm256_and_ps(a, a); - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - - __m256i input = _mm256_castps_si256(flush); + __m256i input = _mm256_castps_si256(a); #ifdef EIGEN_VECTORIZE_AVX2 // uint32_t lsb = (input >> 16); @@ -1293,7 +1288,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { // input = input >> 16; t = _mm256_srli_epi32(t, 16); // Check NaN before converting back to bf16 - __m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q); + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); __m256i nan = _mm256_set1_epi32(0x7fc0); t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask)); // output = numext::bit_cast(input); @@ -1316,7 +1311,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { lo = _mm_srli_epi32(lo, 16); hi = _mm_srli_epi32(hi, 16); // Check NaN before converting back to bf16 - __m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q); + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); __m128i nan = _mm_set1_epi32(0x7fc0); lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask))); hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1))); diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 59bbef0d1..34d49ab66 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -1945,23 +1945,15 @@ EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) { EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { Packet16bf r; - // Flush input denormals value to zero with hardware capability. 
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -#if defined(EIGEN_VECTORIZE_AVX512DQ) - __m512 flush = _mm512_and_ps(a, a); -#else - __m512 flush = _mm512_max_ps(a, a); -#endif // EIGEN_VECTORIZE_AVX512DQ - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) // Since GCC 10.1 supports avx512bf16 and C style explicit cast // (C++ static_cast is not supported yet), do converion via intrinsic // and register path for performance. - r = (__m256i)(_mm512_cvtneps_pbh(flush)); + r = (__m256i)(_mm512_cvtneps_pbh(a)); + #else __m512i t; - __m512i input = _mm512_castps_si512(flush); + __m512i input = _mm512_castps_si512(a); __m512i nan = _mm512_set1_epi32(0x7fc0); // uint32_t lsb = (input >> 16) & 1; @@ -1974,9 +1966,9 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { t = _mm512_srli_epi32(t, 16); // Check NaN before converting back to bf16 - __mmask16 mask = _mm512_cmp_ps_mask(flush, flush, _CMP_ORD_Q); - t = _mm512_mask_blend_epi32(mask, nan, t); + __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + t = _mm512_mask_blend_epi32(mask, nan, t); // output.value = static_cast(input); r = _mm512_cvtepi32_epi16(t); #endif // EIGEN_VECTORIZE_AVX512BF16 diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index aac60f15c..1c28f4f95 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h @@ -250,10 +250,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { output.value = std::signbit(v) ? 0xFFC0: 0x7FC0; return output; - } else if (std::fabs(v) < std::numeric_limits::min EIGEN_NOT_A_MACRO()) { - // Flush denormal to +/- 0. - output.value = std::signbit(v) ? 
0x8000 : 0; - return output; } const uint16_t* p = reinterpret_cast(&v); #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ @@ -288,9 +284,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne::min EIGEN_NOT_A_MACRO()) { - // Flush denormal to +/- 0.0 - output.value = std::signbit(ff) ? 0x8000 : 0; } else { // Fast rounding algorithm that rounds a half value to nearest even. This // reduces expected error when we convert a large number of floats. Here diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp index 1df22f73e..c3de0b19a 100644 --- a/test/bfloat16_float.cpp +++ b/test/bfloat16_float.cpp @@ -32,18 +32,6 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa, return dest; } -void test_truncate(float input, float expected_truncation, float expected_rounding){ - bfloat16 truncated = Eigen::bfloat16_impl::truncate_to_bfloat16(input); - bfloat16 rounded = Eigen::bfloat16_impl::float_to_bfloat16_rtne(input); - if ((numext::isnan)(input)){ - VERIFY((numext::isnan)(static_cast(truncated)) || (numext::isinf)(static_cast(truncated))); - VERIFY((numext::isnan)(static_cast(rounded)) || (numext::isinf)(static_cast(rounded))); - return; - } - VERIFY_IS_EQUAL(expected_truncation, static_cast(truncated)); - VERIFY_IS_EQUAL(expected_rounding, static_cast(rounded)); -} - template void test_roundtrip() { // Representable T round trip via bfloat16 @@ -122,31 +110,6 @@ void test_conversion() VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000); VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000); - // Flush denormals to zero - for (float denorm = -std::numeric_limits::denorm_min(); - denorm < std::numeric_limits::denorm_min(); - denorm = nextafterf(denorm, 1.0f)) { - bfloat16 bf_trunc = Eigen::bfloat16_impl::truncate_to_bfloat16(denorm); - VERIFY_IS_EQUAL(static_cast(bf_trunc), 0.0f); - - // Implicit conversion of denormls to bool is correct - VERIFY_IS_EQUAL(static_cast(bfloat16(denorm)), false); - 
VERIFY_IS_EQUAL(bfloat16(denorm), false); - - if (std::signbit(denorm)) { - VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000); - } else { - VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000); - } - bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne(denorm); - VERIFY_IS_EQUAL(static_cast(bf_round), 0.0f); - if (std::signbit(denorm)) { - VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000); - } else { - VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000); - } - } - // Default is zero VERIFY_IS_EQUAL(static_cast(bfloat16()), 0.0f); @@ -156,52 +119,6 @@ void test_conversion() test_roundtrip >(); test_roundtrip >(); - // Truncate test - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0xf5c3), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(1, 0x80, 0x48, 0xf5c3), - BinaryToFloat(1, 0x80, 0x48, 0x0000), - BinaryToFloat(1, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x8000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0xff, 0x00, 0x0001), - BinaryToFloat(0, 0xff, 0x40, 0x0000), - BinaryToFloat(0, 0xff, 0x40, 0x0000)); - test_truncate( - BinaryToFloat(0, 0xff, 0x7f, 0xffff), - BinaryToFloat(0, 0xff, 0x40, 0x0000), - BinaryToFloat(0, 0xff, 0x40, 0x0000)); - test_truncate( - BinaryToFloat(1, 0x80, 0x48, 0xc000), - BinaryToFloat(1, 0x80, 0x48, 0x0000), - BinaryToFloat(1, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x4000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x8000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x00, 0x48, 0x8000), - BinaryToFloat(0, 0x00, 0x00, 0x0000), - BinaryToFloat(0, 0x00, 0x00, 
0x0000)); - test_truncate( - BinaryToFloat(0, 0x00, 0x7f, 0xc000), - BinaryToFloat(0, 0x00, 0x00, 0x0000), - BinaryToFloat(0, 0x00, 0x00, 0x0000)); - // Conversion Array a; for (int i = 0; i < 100; i++) a(i) = i + 1.25; @@ -250,12 +167,6 @@ void test_conversion() VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0); VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0); - VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16( - BinaryToFloat(0x0, 0xff, 0x40, 0x0)), - 0x7fc0); - VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16( - BinaryToFloat(0x1, 0xff, 0x40, 0x0)), - 0xffc0); } void test_numtraits() -- GitLab From 338924602d96fa997f7011454f506c158344403f Mon Sep 17 00:00:00 2001 From: jenswehner Date: Tue, 10 Aug 2021 13:34:57 +0200 Subject: [PATCH 084/266] added includes for unordered_map (cherry picked from commit e3e74001f7c4bf95f0dde572e8a08c5b2918a3ab) --- unsupported/test/sparse_extra.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index cdfd10ca4..602c2cb84 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp @@ -31,6 +31,22 @@ static long g_dense_op_sparse_count = 0; #include "sparse_basic.cpp" #endif +#if EIGEN_HAS_CXX11 + +#ifdef min +#undef min +#endif + +#ifdef max +#undef max +#endif + +#include +#define EIGEN_UNORDERED_MAP_SUPPORT + +#endif + + #include template @@ -146,6 +162,7 @@ template void sparse_extra(const SparseMatrixType& re } + template void check_marketio() { -- GitLab From 13d7658c5d0651f1ea16b95c3b47cbefb706135f Mon Sep 17 00:00:00 2001 From: ChipKerchner Date: Tue, 10 Aug 2021 15:03:18 -0500 Subject: [PATCH 085/266] Fix errors on older compilers (gcc 7.5 - lack of vec_neg, clang10 - can not use const pointers with vec_xl). 
(cherry picked from commit 413bc491f1721afdb9802553b13a5b7aba67ed3b) --- Eigen/src/Core/arch/AltiVec/Complex.h | 4 ++-- Eigen/src/Core/arch/AltiVec/PacketMath.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index e1711930b..f424f11cf 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -74,7 +74,7 @@ struct Packet2cf return Packet2cf(*this) -= b; } EIGEN_STRONG_INLINE Packet2cf operator-(void) const { - return Packet2cf(vec_neg(v)); + return Packet2cf(-v); } Packet4f v; @@ -294,7 +294,7 @@ struct Packet1cd return Packet1cd(*this) -= b; } EIGEN_STRONG_INLINE Packet1cd operator-(void) const { - return Packet1cd(vec_neg(v)); + return Packet1cd(-v); } Packet2d v; diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 8c42f495c..f48f261ba 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -433,7 +433,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_xl(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); #endif @@ -952,7 +952,7 @@ template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPAC return (Packet) vec_perm(MSQ, LSQ, mask); // align the data #else EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #endif } @@ -2453,7 +2453,7 @@ template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return vec_xl(0, const_cast(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) -- GitLab From cb44a003de4f9795f080dcd7481fcc4fc7a1713d Mon Sep 17 
00:00:00 2001 From: Nikolay Tverdokhleb Date: Thu, 5 Aug 2021 14:15:41 +0200 Subject: [PATCH 086/266] Do not set AnnoyingScalar::dont_throw if not defined EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW. - Because that member is not declared if the macro is defined. (cherry picked from commit f1b899eef7461e1475469b733346c6ebbfae8818) --- test/conservative_resize.cpp | 6 +++++- test/sparse_block.cpp | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp index d709e3346..d48eb126f 100644 --- a/test/conservative_resize.cpp +++ b/test/conservative_resize.cpp @@ -115,9 +115,11 @@ template void noncopyable() { typedef Eigen::Matrix VectorType; typedef Eigen::Matrix MatrixType; - + { +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif int n = 50; VectorType v0(n), v1(n); MatrixType m0(n,n), m1(n,n), m2(n,n); @@ -156,7 +158,9 @@ EIGEN_DECLARE_TEST(conservative_resize) CALL_SUBTEST_4((run_vector_tests >())); CALL_SUBTEST_5((run_vector_tests >())); +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif CALL_SUBTEST_6(( run_vector_tests() )); CALL_SUBTEST_6(( noncopyable<0>() )); } diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp index f9668102c..b4905b053 100644 --- a/test/sparse_block.cpp +++ b/test/sparse_block.cpp @@ -315,8 +315,9 @@ EIGEN_DECLARE_TEST(sparse_block) CALL_SUBTEST_4(( sparse_block(SparseMatrix(short(r), short(c))) )); CALL_SUBTEST_4(( sparse_block(SparseMatrix(short(r), short(c))) )); - +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif CALL_SUBTEST_5(( sparse_block(SparseMatrix(r,c)) )); } } -- GitLab From 6d2506040cb7dc163c8bcd392535dc83698b5388 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 11 Aug 2021 18:10:01 +0000 Subject: [PATCH 087/266] * revise the meta_least_common_multiple function template, add a bool variable to check whether the A is larger than 
B. * This can make less compile_time if A is smaller than B. and avoid failure in compile if we get a little A and a great B. Authored by @awoniu. (cherry picked from commit 8ce341caf2947e4b5ac4580c20254ae7d828b009) --- Eigen/src/Core/util/Meta.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 2429ddad2..81ae2a32d 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -715,20 +715,25 @@ class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? /** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. + * at compile-time. */ -template +template=B)> struct meta_least_common_multiple { enum { ret = meta_least_common_multiple::ret }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; template -struct meta_least_common_multiple +struct meta_least_common_multiple { enum { ret = A*K }; }; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { -- GitLab From 0d890127082a0c9cde813dc43f0a9323c47d7369 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 12 Aug 2021 08:30:12 -0700 Subject: [PATCH 088/266] Update code snippet for tridiagonalize_inplace. 
(cherry picked from commit fb1718ad14485ccf733d90807253e47c1f72e275) --- doc/snippets/Tridiagonalization_decomposeInPlace.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp index 93dcfca1d..3cdce679b 100644 --- a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp +++ b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp @@ -4,7 +4,8 @@ cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl; VectorXd diag(5); VectorXd subdiag(4); -internal::tridiagonalization_inplace(A, diag, subdiag, true); +VectorXd hcoeffs(4); // Scratch space for householder reflector. +internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, true); cout << "The orthogonal matrix Q is:" << endl << A << endl; cout << "The diagonal of the tridiagonal matrix T is:" << endl << diag << endl; cout << "The subdiagonal of the tridiagonal matrix T is:" << endl << subdiag << endl; -- GitLab From 576e451b10cca9c465f19cc3cf9fd7e98c60602c Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 12 Aug 2021 08:54:29 -0700 Subject: [PATCH 089/266] Add CompleteOrthogonalDecomposition to the table of linear algeba decompositions. 
(cherry picked from commit 96e3b4fc957834ad6736f7455c263d3a4158dc37) --- doc/TopicLinearAlgebraDecompositions.dox | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 0965da872..c960fb836 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -72,7 +72,7 @@ To get an overview of the true relative speed of the different decompositions, c Orthogonalization Yes Excellent - Soon: blocking + - @@ -87,6 +87,18 @@ To get an overview of the true relative speed of the different decompositions, c - + + CompleteOrthogonalDecomposition + - + Fast + Good + Yes + Orthogonalization + Yes + Excellent + - + + LLT Positive definite -- GitLab From 44cc96e1a10692e72074fa62835a762b6512521f Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 12 Aug 2021 21:38:54 +0000 Subject: [PATCH 090/266] Get rid of used uninitialized warnings for EIGEN_UNUSED_VARIABLE in gcc11+ (cherry picked from commit 66499f0f172d0758360043e9c578761c0f7d50cd) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 16 ++++++++-------- Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h | 8 ++++---- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 454b36cd6..380dc623b 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -1570,7 +1570,7 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, 
accZero6, accZero7; PacketBlock acc; @@ -1607,7 +1607,7 @@ EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, *lhs_ptr7; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL; PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; PacketBlock acc; @@ -2180,9 +2180,9 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; PacketBlock accReal0, accImag0, accReal1, accImag1; PacketBlock accReal2, accImag2, accReal3, accImag3; PacketBlock accReal4, accImag4; @@ -2234,9 +2234,9 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; PacketBlock accReal0, accImag0, accReal1, accImag1; 
PacketBlock accReal2, accImag2, accReal3, accImag3; PacketBlock accReal4, accImag4; diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 13d9517e4..6540c6fa6 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -256,7 +256,7 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; MICRO_MMA_SRC_PTR @@ -510,9 +510,9 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; + const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; MICRO_COMPLEX_MMA_SRC_PTR -- GitLab From 0b56b62f30bec7ac27fe50f7c1d8ffce299218b7 Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Fri, 13 Aug 2021 11:21:28 -0500 Subject: [PATCH 091/266] =?UTF-8?q?Reverse=20compare=20logic=20=C2=83in=20?= =?UTF-8?q?F32ToBf16=20since=20vec=5Fcmpne=20is=20not=20available=20in=20P?= =?UTF-8?q?ower8=20-=20now=20compiles=20for=20clang10=20default=20(P8).?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit e07227c411cb5ed5c6252b594fe841867bd19f6a) --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index f48f261ba..2503f29e0 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -902,8 +902,8 @@ template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, con return pxor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { return vec_sel(b, a, reinterpret_cast(mask)); @@ -1260,15 +1260,15 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast(p4i_ZERO)); - Packet4bi is_mant_not_zero = vec_cmpne(mantissa, reinterpret_cast(p4i_ZERO)); - Packet4ui nan_selector = pand( + Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast(p4i_ZERO)); + Packet4ui nan_selector = pandnot( reinterpret_cast(is_max_exp), - reinterpret_cast(is_mant_not_zero) + reinterpret_cast(is_mant_zero) ); - Packet4ui subnormal_selector = pand( + Packet4ui subnormal_selector = pandnot( reinterpret_cast(is_zero_exp), - reinterpret_cast(is_mant_not_zero) + reinterpret_cast(is_mant_zero) ); const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); -- 
GitLab From cd474d4cd0ad35b02a3f63c226ca933fa379a841 Mon Sep 17 00:00:00 2001 From: andiwand Date: Mon, 16 Aug 2021 12:02:33 +0000 Subject: [PATCH 092/266] minor doc fix in Map.h (cherry picked from commit 5c6b3efead69636dec1599aa54dab4617755013c) --- Eigen/src/Core/Map.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 93d2ae907..218cc157f 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -47,7 +47,7 @@ private: * \brief A matrix or vector expression mapping an existing array of data. * * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. + * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. * The default is \c #Unaligned. * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout * of an ordinary, contiguous array. This can be overridden by specifying strides. -- GitLab From 926e1a8226837fb1e672e335e0c411df74f7479a Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 16 Aug 2021 21:56:18 +0000 Subject: [PATCH 093/266] Update documentation for matrix decompositions and least squares solvers. 
(cherry picked from commit 7e6f94961cb4444d3c20660d8cc492d28ada1415) --- doc/LeastSquares.dox | 15 ++++-- doc/TopicLinearAlgebraDecompositions.dox | 4 +- doc/TutorialLinearAlgebra.dox | 69 +++++++++++++----------- 3 files changed, 50 insertions(+), 38 deletions(-) diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox index 24dfe4b4f..ddbf38dec 100644 --- a/doc/LeastSquares.dox +++ b/doc/LeastSquares.dox @@ -30,14 +30,17 @@ computing least squares solutions: This is example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink. +If you just need to solve the least squares problem, but are not interested in the SVD per se, a +faster alternative method is CompleteOrthogonalDecomposition. \section LeastSquaresQR Using the QR decomposition The solve() method in QR decomposition classes also computes the least squares solution. There are -three QR decomposition classes: HouseholderQR (no pivoting, so fast but unstable), -ColPivHouseholderQR (column pivoting, thus a bit slower but more accurate) and FullPivHouseholderQR -(full pivoting, so slowest and most stable). Here is an example with column pivoting: +three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is +not full rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and +FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR). +Here is an example with column pivoting: @@ -61,9 +64,11 @@ Finding the least squares solution of \a Ax = \a b is equivalent to solving the
Example:Output:
-If the matrix \a A is ill-conditioned, then this is not a good method, because the condition number +This method is usually the fastest, especially when \a A is "tall and skinny". However, if the +matrix \a A is even mildly ill-conditioned, this is not a good method, because the condition number of ATA is the square of the condition number of \a A. This means that you -lose twice as many digits using normal equation than if you use the other methods. +lose roughly twice as many digits of accuracy using the normal equation, compared to the more stable +methods mentioned above. */ diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index c960fb836..402b3769e 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -99,7 +99,7 @@ To get an overview of the true relative speed of the different decompositions, c - - + LLT Positive definite Very fast @@ -111,7 +111,7 @@ To get an overview of the true relative speed of the different decompositions, c Blocking - + LDLT Positive or negative semidefinite1 Very fast diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox index a72724143..8042fcad3 100644 --- a/doc/TutorialLinearAlgebra.dox +++ b/doc/TutorialLinearAlgebra.dox @@ -14,7 +14,7 @@ QR, %SVD, eigendecompositions... After reading this page, don't miss our \f[ Ax \: = \: b \f] Where \a A and \a b are matrices (\a b could be a vector, as a special case). You want to find a solution \a x. -\b The \b solution: You can choose between various decompositions, depending on what your matrix \a A looks like, +\b The \b solution: You can choose between various decompositions, depending on the properties of your matrix \a A, and depending on whether you favor speed or accuracy. 
However, let's start with an example that works in all cases, and is a good compromise: @@ -34,7 +34,7 @@ Vector3f x = dec.solve(b); Here, ColPivHouseholderQR is a QR decomposition with column pivoting. It's a good compromise for this tutorial, as it works for all matrices while being quite fast. Here is a table of some other decompositions that you can choose from, -depending on your matrix and the trade-off you want to make: +depending on your matrix, the problem you are trying to solve, and the trade-off you want to make:
@@ -128,11 +128,13 @@ depending on your matrix and the trade-off you want to make:
To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink. -All of these decompositions offer a solve() method that works as in the above example. +All of these decompositions offer a solve() method that works as in the above example. -For example, if your matrix is positive definite, the above table says that a very good -choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general -matrix (not a vector) as right hand side is possible. +If you know more about the properties of your matrix, you can use the above table to select the best method. +For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU. +If you know that your matrix is also symmetric and positive definite, the above table says that +a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general +matrix (not a vector) as right hand side is possible: @@ -146,7 +148,34 @@ For a \ref TopicLinearAlgebraDecompositions "much more complete table" comparing supports many other decompositions), see our special page on \ref TopicLinearAlgebraDecompositions "this topic". -\section TutorialLinAlgSolutionExists Checking if a solution really exists + +\section TutorialLinAlgLeastsquares Least squares solving + +The most general and accurate method to solve under- or over-determined linear systems +in the least squares sense, is the SVD decomposition. Eigen provides two implementations. +The recommended one is the BDCSVD class, which scales well for large problems +and automatically falls back to the JacobiSVD class for smaller problems. +For both classes, their solve() method solves the linear system in the least-squares +sense. + +Here is an example: +
Example:Output:
+ + + + + +
Example:Output:
\include TutorialLinAlgSVDSolve.cpp \verbinclude TutorialLinAlgSVDSolve.out
+ +An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition. + +Again, if you know more about the problem, the table above contains methods that are potentially faster. +If your matrix is full rank, HouseholderQR is the method of choice. If your matrix is full rank and well conditioned, +using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still. +Our page on \link LeastSquares least squares solving \endlink has more details. + + +\section TutorialLinAlgSolutionExists Checking if a matrix is singular Only you know what error margin you want to allow for a solution to be considered valid. So Eigen lets you do this computation for yourself, if you want to, as in this example: @@ -179,11 +208,11 @@ very rare. The call to info() is to check for this possibility. \section TutorialLinAlgInverse Computing inverse and determinant First of all, make sure that you really want this. While inverse and determinant are fundamental mathematical concepts, -in \em numerical linear algebra they are not as popular as in pure mathematics. Inverse computations are often +in \em numerical linear algebra they are not as useful as in pure mathematics. Inverse computations are often advantageously replaced by solve() operations, and the determinant is often \em not a good way of checking if a matrix is invertible. -However, for \em very \em small matrices, the above may not be true, and inverse and determinant can be very useful. While certain decompositions, such as PartialPivLU and FullPivLU, offer inverse() and determinant() methods, you can also call inverse() and determinant() directly on a matrix. 
If your matrix is of a very small fixed size (at most 4x4) this @@ -198,28 +227,6 @@ Here is an example: -\section TutorialLinAlgLeastsquares Least squares solving - -The most accurate method to do least squares solving is with a SVD decomposition. -Eigen provides two implementations. -The recommended one is the BDCSVD class, which scale well for large problems -and automatically fall-back to the JacobiSVD class for smaller problems. -For both classes, their solve() method is doing least-squares solving. - -Here is an example: - - - - - - -
Example:Output:
\include TutorialLinAlgSVDSolve.cpp \verbinclude TutorialLinAlgSVDSolve.out
- -Another methods, potentially faster but less reliable, are to use a Cholesky decomposition of the -normal matrix or a QR decomposition. Our page on \link LeastSquares least squares solving \endlink -has more details. - - \section TutorialLinAlgSeparateComputation Separating the computation from the construction In the above examples, the decomposition was computed at the same time that the decomposition object was constructed. -- GitLab From f57dec64efa761431435e5aa9eb58ee654387f7a Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Mon, 16 Aug 2021 20:26:50 -0500 Subject: [PATCH 094/266] Fix unaligned loads in ploadLhs & ploadRhs for P8. (cherry picked from commit 8dcf3e38ba9913021ce6a831836a59217e21baf2) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 2 +- Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 380dc623b..3f79b97df 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -1113,7 +1113,7 @@ EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) { - return *reinterpret_cast(const_cast(lhs)); + return ploadu(lhs); } // Zero the accumulator on PacketBlock. 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 41b27bf3d..33d543494 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -214,7 +214,7 @@ EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) { - return *reinterpret_cast(const_cast(rhs)); + return ploadu(rhs); } } // end namespace internal -- GitLab From f1032255d3a5e51a6a5e17bab1419b30eb050959 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 17 Aug 2021 07:42:04 -0700 Subject: [PATCH 095/266] Add missing PPC packet comparisons. This is to fix the packetmath tests on the ppc pipeline. (cherry picked from commit 2cc6ee0d2e76e88fe1476f6b0eae12edb68b1c8a) --- Eigen/src/Core/arch/AltiVec/PacketMath.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 2503f29e0..2a440545b 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -867,17 +867,26 @@ template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } - -template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const 
Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } - template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { Packet4f c = reinterpret_cast(vec_cmpge(a,b)); return vec_nor(c,c); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE 
Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } -- GitLab From aef926abf6b7949b16d00810eac6aa3495949747 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 17 Aug 2021 20:04:48 -0700 Subject: [PATCH 096/266] Renamed shift_left/shift_right to shiftLeft/shiftRight. For naming consistency. Also moved to ArrayCwiseUnaryOps, and added test. (cherry picked from commit fc9d352432b81210f73d71caecbd7dc5505d6ab8) --- Eigen/src/plugins/ArrayCwiseUnaryOps.h | 39 ++++++++++++++++++++++ Eigen/src/plugins/CommonCwiseUnaryOps.h | 43 ------------------------- test/array_cwise.cpp | 37 +++++++++++++++++++++ 3 files changed, 76 insertions(+), 43 deletions(-) diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index b7ea22a9d..13c55f4b1 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -497,6 +497,45 @@ ceil() const return CeilReturnType(derived()); } +template struct ShiftRightXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type arithmetically + * shifted right by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. 
+ * + * \sa shiftLeft() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftRightXpr::Type +shiftRight() const +{ + return typename ShiftRightXpr::Type(derived()); +} + + +template struct ShiftLeftXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type logically + * shifted left by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftRight() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftLeftXpr::Type +shiftLeft() const +{ + return typename ShiftLeftXpr::Type(derived()); +} + /** \returns an expression of the coefficient-wise isnan of *this. * * Example: \include Cwise_isNaN.cpp diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h index 42ff901ca..5418dc415 100644 --- a/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -64,49 +64,6 @@ cast() const return typename CastXpr::Type(derived()); } -template struct ShiftRightXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type arithmetically -/// shifted right by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftRightXpr::Type -shift_right() const -{ - return typename ShiftRightXpr::Type(derived()); -} - - -template struct ShiftLeftXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type logically -/// shifted left by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. 
-/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftLeftXpr::Type -shift_left() const -{ - return typename ShiftLeftXpr::Type(derived()); -} - /// \returns an expression of the complex conjugate of \c *this. /// EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 1bc8e19f9..0cc438b39 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -626,6 +626,41 @@ template void min_max(const ArrayType& m) } } +template +struct shift_left { + template + Scalar operator()(const Scalar& v) const { + return v << N; + } +}; + +template +struct arithmetic_shift_right { + template + Scalar operator()(const Scalar& v) const { + return v >> N; + } +}; + +template void array_integer(const ArrayType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); + + ArrayType m1 = ArrayType::Random(rows, cols), + m2(rows, cols); + + m2 = m1.template shiftLeft<2>(); + VERIFY( (m2 == m1.unaryExpr(shift_left<2>())).all() ); + m2 = m1.template shiftLeft<9>(); + VERIFY( (m2 == m1.unaryExpr(shift_left<9>())).all() ); + + m2 = m1.template shiftRight<2>(); + VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<2>())).all() ); + m2 = m1.template shiftRight<9>(); + VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() ); +} + EIGEN_DECLARE_TEST(array_cwise) { for(int i = 0; i < g_repeat; i++) { @@ -636,6 +671,8 @@ EIGEN_DECLARE_TEST(array_cwise) CALL_SUBTEST_5( array(ArrayXXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( 
array_integer(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( comparisons(Array()) ); -- GitLab From 1ec173b54eb4eb7046aab160589573aa3b9bb8bc Mon Sep 17 00:00:00 2001 From: Jakob Struye Date: Wed, 18 Aug 2021 15:04:53 +0200 Subject: [PATCH 097/266] Clearer doc for squaredNorm (cherry picked from commit 53a29c7e351646efe31ee85666c8f268f8e0d462) --- Eigen/src/Core/Dot.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 41a8cb437..5c3441b92 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -86,7 +86,7 @@ MatrixBase::dot(const MatrixBase& other) const //---------- implementation of L2 norm and related functions ---------- -/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm. +/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm. * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. * -- GitLab From fd100138dd5e27eb046bb32d7082d0074c76c061 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 17 Aug 2021 09:43:15 -0700 Subject: [PATCH 098/266] Remove unaligned assert tests. Manually constructing an unaligned object declared as aligned invokes UB, so we cannot technically check for alignment from within the constructor. Newer versions of clang optimize away this check. Removing the affected tests. 
(cherry picked from commit 0c4ae56e3797cc6719a8d08a0dafad0a5139a5f9) --- test/CMakeLists.txt | 1 - test/geo_hyperplane.cpp | 5 - test/geo_parametrizedline.cpp | 5 - test/geo_quaternion.cpp | 8 -- test/geo_transformations.cpp | 5 - test/unalignedassert.cpp | 180 ---------------------------------- 6 files changed, 204 deletions(-) delete mode 100644 test/unalignedassert.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 56664e783..5136f82aa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -164,7 +164,6 @@ ei_add_test(nullary) ei_add_test(mixingtypes) ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") -ei_add_test(unalignedassert) ei_add_test(vectorization_logic) ei_add_test(basicstuff) ei_add_test(constructor) diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index 2c89ecd21..44b2f2aec 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -172,11 +172,6 @@ template void hyperplane_alignment() VERIFY_IS_APPROX(p1->coeffs(), p2->coeffs()); VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs()); - - #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES > 0 - if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) - VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Plane3a)); - #endif } diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp index 7135c8fa5..e4b194abc 100644 --- a/test/geo_parametrizedline.cpp +++ b/test/geo_parametrizedline.cpp @@ -110,11 +110,6 @@ template void parametrizedline_alignment() VERIFY_IS_APPROX(p1->origin(), p3->origin()); VERIFY_IS_APPROX(p1->direction(), p2->direction()); VERIFY_IS_APPROX(p1->direction(), p3->direction()); - - #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) - VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Line4a)); - #endif } EIGEN_DECLARE_TEST(geo_parametrizedline) diff --git a/test/geo_quaternion.cpp 
b/test/geo_quaternion.cpp index c4a3162b3..c561fc89d 100644 --- a/test/geo_quaternion.cpp +++ b/test/geo_quaternion.cpp @@ -218,10 +218,6 @@ template void mapQuaternion(void){ VERIFY_IS_APPROX(q1.coeffs(), q2.coeffs()); VERIFY_IS_APPROX(q1.coeffs(), q3.coeffs()); VERIFY_IS_APPROX(q4.coeffs(), q3.coeffs()); - #ifdef EIGEN_VECTORIZE - if(internal::packet_traits::Vectorizable) - VERIFY_RAISES_ASSERT((MQuaternionA(array3unaligned))); - #endif VERIFY_IS_APPROX(mq1 * (mq1.inverse() * v1), v1); VERIFY_IS_APPROX(mq1 * (mq1.conjugate() * v1), v1); @@ -281,10 +277,6 @@ template void quaternionAlignment(void){ VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs()); VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs()); - #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) - VERIFY_RAISES_ASSERT((::new(reinterpret_cast(arrayunaligned)) QuaternionA)); - #endif } template void check_const_correctness(const PlainObjectType&) diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp index d433561cb..72c6edac1 100644 --- a/test/geo_transformations.cpp +++ b/test/geo_transformations.cpp @@ -582,11 +582,6 @@ template void transform_alignment() VERIFY_IS_APPROX(p1->matrix(), p3->matrix()); VERIFY_IS_APPROX( (*p1) * (*p1), (*p2)*(*p3)); - - #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(internal::packet_traits::Vectorizable) - VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Projective3a)); - #endif } template void transform_products() diff --git a/test/unalignedassert.cpp b/test/unalignedassert.cpp deleted file mode 100644 index 120cc42bb..000000000 --- a/test/unalignedassert.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008 Benoit Jacob -// Copyright (C) 2015 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. 
If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_TEST_PART_1) - // default -#elif defined(EIGEN_TEST_PART_2) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 16 - #define EIGEN_MAX_ALIGN_BYTES 16 -#elif defined(EIGEN_TEST_PART_3) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 32 - #define EIGEN_MAX_ALIGN_BYTES 32 -#elif defined(EIGEN_TEST_PART_4) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 64 - #define EIGEN_MAX_ALIGN_BYTES 64 -#endif - -#include "main.h" - -typedef Matrix Vector6f; -typedef Matrix Vector8f; -typedef Matrix Vector12f; - -typedef Matrix Vector5d; -typedef Matrix Vector6d; -typedef Matrix Vector7d; -typedef Matrix Vector8d; -typedef Matrix Vector9d; -typedef Matrix Vector10d; -typedef Matrix Vector12d; - -struct TestNew1 -{ - MatrixXd m; // good: m will allocate its own array, taking care of alignment. - TestNew1() : m(20,20) {} -}; - -struct TestNew2 -{ - Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned, - // 8-byte alignment is good enough here, which we'll get automatically -}; - -struct TestNew3 -{ - Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned -}; - -struct TestNew4 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - Vector2d m; - float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects -}; - -struct TestNew5 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - float f; // try the f at first -- the EIGEN_ALIGN_MAX attribute of m should make that still work - Matrix4f m; -}; - -struct TestNew6 -{ - Matrix m; // good: no alignment requested - float f; -}; - -template struct Depends -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(Align) - Vector2d m; - float f; -}; - -template -void check_unalignedassert_good() -{ - T *x, *y; - x = new T; - delete x; - y = new T[2]; - delete[] y; -} - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -template -void 
construct_at_boundary(int boundary) -{ - char buf[sizeof(T)+256]; - size_t _buf = reinterpret_cast(buf); - _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned - _buf += boundary; // make exact boundary-aligned - T *x = ::new(reinterpret_cast(_buf)) T; - x[0].setZero(); // just in order to silence warnings - x->~T(); -} -#endif - -void unalignedassert() -{ -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - construct_at_boundary(4); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); -#endif - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good >(); - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(EIGEN_MAX_ALIGN_BYTES>=16) - { - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - 
VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - // Complexes are disabled because the compiler might aggressively vectorize - // the initialization of complex coeffs to 0 before we can check for alignedness - //VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - } - for(int b=8; b(b)); - if(b<64) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - //if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - } -#endif -} - -EIGEN_DECLARE_TEST(unalignedassert) -{ - CALL_SUBTEST(unalignedassert()); -} -- GitLab From 115591b9e37c0400c8a529728cda6215a76b27d6 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 18 Aug 2021 11:35:02 -0700 Subject: [PATCH 099/266] Workaround VS 2017 arg bug. In VS 2017, `std::arg` for real inputs always returns 0, even for negative inputs. It should return `PI` for negative real values. This seems to be fixed in VS 2019 (MSVC 1920). (cherry picked from commit 2b410ecbefea1bf4b9d50decb946a4ebe4a73f98) --- Eigen/src/Core/MathFunctions.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index d7ac4d64d..61b78f4f2 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -572,7 +572,9 @@ struct rint_retval * Implementation of arg * ****************************************************************************/ -#if EIGEN_HAS_CXX11_MATH +// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. +// This seems to be fixed in VS 2019. 
+#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) // std::arg is only defined for types of std::complex, or integer types or float/double/long double template::IsComplex || is_integral::value -- GitLab From 3147391d946bb4b6c68edd901f2add6ac1f31f8c Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 18 Aug 2021 13:41:58 -0700 Subject: [PATCH 100/266] Change version to 3.4.0. --- Eigen/src/Core/util/Macros.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 518c6c193..986c3d44d 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------------------ #define EIGEN_WORLD_VERSION 3 -#define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 91 +#define EIGEN_MAJOR_VERSION 4 +#define EIGEN_MINOR_VERSION 0 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ -- GitLab From 7aee90b8d38ec7f3b7ec7124b04216ce2e014550 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 30 Aug 2021 08:06:55 -0700 Subject: [PATCH 101/266] Fix fix when variable templates are not supported. There were some typos that checked `EIGEN_HAS_CXX14` that should have checked `EIGEN_HAS_CXX14_VARIABLE_TEMPLATES`, causing a mismatch in some of the `Eigen::fix` assumptions. Also fixed the `symbolic_index` test when `EIGEN_HAS_CXX14_VARIABLE_TEMPLATES` is 0. 
Fixes #2308 (cherry picked from commit 5db9e5c77958997856ddbccfa4a52ff22e83bef9) --- Eigen/src/Core/util/IntegralConstant.h | 6 +++--- test/symbolic_index.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index 945d426ea..e0092f654 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -138,7 +138,7 @@ template struct get_fixed_value,Default> { static const int value = N; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template struct get_fixed_value (*)(),Default> { static const int value = N; }; @@ -154,7 +154,7 @@ struct get_fixed_value,Default> { }; template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif @@ -166,7 +166,7 @@ template struct clea // Convert any integral type (e.g., short, int, unsigned int, etc.) 
to Eigen::Index template struct cleanup_index_type::value>::type> { typedef Index type; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; #endif diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp index b114cbb95..a75ca1165 100644 --- a/test/symbolic_index.cpp +++ b/test/symbolic_index.cpp @@ -58,15 +58,15 @@ void check_symbolic_index() VERIFY( is_same_type( fix<9>()/2, int(9/2) ) ); VERIFY( is_same_symb( lastp1-1, last, size) ); - VERIFY( is_same_symb( lastp1-fix<1>, last, size) ); + VERIFY( is_same_symb( lastp1-fix<1>(), last, size) ); VERIFY_IS_EQUAL( ( (last*5-2)/3 ).eval(last=size-1), ((size-1)*5-2)/3 ); - VERIFY_IS_EQUAL( ( (last*fix<5>-fix<2>)/fix<3> ).eval(last=size-1), ((size-1)*5-2)/3 ); + VERIFY_IS_EQUAL( ( (last*fix<5>()-fix<2>())/fix<3>() ).eval(last=size-1), ((size-1)*5-2)/3 ); VERIFY_IS_EQUAL( ( -last*lastp1 ).eval(last=size-1), -(size-1)*size ); VERIFY_IS_EQUAL( ( lastp1-3*last ).eval(last=size-1), size- 3*(size-1) ); VERIFY_IS_EQUAL( ( (lastp1-3*last)/lastp1 ).eval(last=size-1), (size- 3*(size-1))/size ); -#if EIGEN_HAS_CXX14 +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES { struct x_tag {}; static const symbolic::SymbolExpr x; struct y_tag {}; static const symbolic::SymbolExpr y; -- GitLab From 277d36906094efa00f7c69a930949f15f5a360cc Mon Sep 17 00:00:00 2001 From: Adam Kallai Date: Fri, 27 Aug 2021 16:39:24 +0200 Subject: [PATCH 102/266] win: include intrin header in Windows on ARM intrin header is needed for _BitScanReverse and _BitScanReverse64 (cherry picked from commit 1415817d8daa7fa72ec9b26a6b9d166a1d54626a) --- Eigen/Core | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/Core b/Eigen/Core index 5921e15f9..3c03519fe 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -109,7 +109,8 @@ #endif // required for __cpuid, needs to be included 
after cmath -#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE +// also required for _BitScanReverse on Windows on ARM +#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE #include #endif -- GitLab From c2b6df6e60a6b0e59ba4a07fd2895cf2a1a74be4 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 31 Aug 2021 08:33:19 -0700 Subject: [PATCH 103/266] Disable cuda Eigen::half vectorization on host. All cuda `__half` functions are device-only in CUDA 9, including conversions. Host-side conversions were added in CUDA 10. The existing code doesn't build prior to 10.0. All arithmetic functions are always device-only, so there is no reason to use vectorization on the host at all. Modified the code to disable vectorization for `__half` on host, which required also updating the `TensorReductionGpu` implementation which previously made assumptions about available packets. (cherry picked from commit cc3573ab4451853774cd5c3497373d5fe8914774) --- Eigen/src/Core/arch/GPU/PacketMath.h | 220 ++++++++---------- Eigen/src/Core/arch/GPU/TypeCasting.h | 3 +- .../Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- .../CXX11/src/Tensor/TensorReductionGpu.h | 57 +++-- 4 files changed, 126 insertions(+), 156 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 689110ded..25c45fd35 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -493,9 +493,10 @@ ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. -#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +// Half-packet functions are not available on the host for CUDA 9.0-9.2, only +// on device. There is no benefit to using them on the host anyways, since they are +// emulated.
+#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; @@ -526,42 +527,9 @@ template<> struct packet_traits : default_packet_traits }; }; -namespace { -// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __halves2half2(a, b); -#else - // Round-about way since __halves2half2 is a __device__ function. - return __floats2half2_rn(__half2float(a), __half2float(b)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __low2half(a); -#else - return __float2half(__low2float(a)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __high2half(a); -#else - return __float2half(__high2float(a)); -#endif -} -} // namespace - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2half2(from); -#else - const float f = __half2float(from); - return __floats2half2_rn(f, f); -#endif } template <> @@ -576,8 +544,6 @@ pset1(const Eigen::half& from) { return r; } -// We now need this visible on both host and device. 
-// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -585,11 +551,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return combine_half(from[0], from[1]); + return __halves2half2(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return combine_half(from[0], from[0]); + return __halves2half2(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -599,8 +565,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = get_half2_low(from); - to[1] = get_half2_high(from); + to[0] = __low2half(from); + to[1] = __high2half(from); } @@ -610,7 +576,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( // Input is guaranteed to be properly aligned. 
return __ldg(reinterpret_cast(from)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } @@ -619,31 +585,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( #if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return combine_half(from[0*stride], from[1*stride]); + return __halves2half2(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = get_half2_low(from); - to[stride*1] = get_half2_high(from); + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return get_half2_low(a); + return __low2half(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); + half a1 = __low2half(a); + half a2 = __high2half(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { @@ -658,12 +624,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = get_half2_low(kernel.packet[0]); - __half a2 = get_half2_high(kernel.packet[0]); - __half b1 = get_half2_low(kernel.packet[1]); - __half b2 = get_half2_high(kernel.packet[1]); - kernel.packet[0] = combine_half(a1, b1); - kernel.packet[1] = combine_half(a2, b2); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = 
__high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { @@ -671,88 +637,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return combine_half(a, __float2half(f)); + return __halves2half2(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = get_half2_low(mask); - half mask_high = get_half2_high(mask); - half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); - half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); - return combine_half(result_low, result_high); + half mask_low = __low2half(mask); + half mask_high = __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? 
true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = 
__low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, @@ -851,9 +817,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -862,9 +828,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? 
__high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { @@ -885,7 +851,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? get_half2_low(a) : get_half2_high(a); + return a1 > a2 ? __low2half(a) : __high2half(a); #endif } @@ -897,7 +863,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? get_half2_low(a) : get_half2_high(a); + return a1 < a2 ? __low2half(a) : __high2half(a); #endif } @@ -1068,10 +1034,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); - p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); - p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); - p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); + p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); + p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); + p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); return r; } @@ -1152,12 +1118,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = get_half2_low(f0); - __half a2 = get_half2_high(f0); - __half b1 = get_half2_low(f1); - __half b2 = get_half2_high(f1); - f0 = combine_half(a1, b1); - f1 = combine_half(a2, b2); + __half a1 = __low2half(f0); + __half a2 = __high2half(f0); + __half b1 = __low2half(f1); + __half b2 = __high2half(f1); + f0 = __halves2half2(a1, b1); + f1 = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void 
@@ -1254,10 +1220,10 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(a, __float2half(f + 1.0f)); - p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); + p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); + p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif } @@ -1477,9 +1443,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_max(a_alias[0]), + half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = combine_half(predux_max(a_alias[2]), + half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1496,9 +1462,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_min(a_alias[0]), + half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = combine_half(predux_min(a_alias[2]), + half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1652,9 +1618,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? 
get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } template<> @@ -1664,14 +1630,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } -// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) - -#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) #undef EIGEN_GPU_HAS_LDG #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 754546225..c8195bb2b 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -15,8 +15,7 @@ namespace Eigen { namespace internal { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index a6181d35e..b90a1dcd6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if 
defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index db4e8d866..315ccc172 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -98,6 +98,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { } } } +#ifdef EIGEN_GPU_COMPILE_PHASE // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations template __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { @@ -107,6 +108,7 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc atomicReduce(houtput+i,*(haccum+i),reducer); } } +#endif // EIGEN_GPU_COMPILE_PHASE #endif // EIGEN_HAS_GPU_FP16 template <> @@ -213,8 +215,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer #ifdef EIGEN_HAS_GPU_FP16 template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, half* scratch) { eigen_assert(blockDim.x == 1); eigen_assert(gridDim.x == 1); typedef packet_traits::type packet_type; @@ -224,15 +226,16 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo half2* h2scratch = reinterpret_cast(scratch); for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { *h2scratch = - __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); + __halves2half2(input.coeff(i), input.coeff(i + 1)); h2scratch++; } if ((num_coeffs 
& 1) != 0) { - half lastCoeff = input.m_impl.coeff(num_coeffs - 1); + half lastCoeff = input.coeff(num_coeffs - 1); *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); } } @@ -258,8 +261,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reduce template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, + half* output, half* scratch) { typedef typename packet_traits::type PacketType; const int packet_width = unpacket_traits::size; eigen_assert(NumPerThread % packet_width == 0); @@ -273,19 +277,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce int rem = num_coeffs % packet_width; if (rem != 0) { half2* p_scratch = reinterpret_cast(scratch); - *scratch = reducer.template initializePacket(); + pstoreu(scratch, reducer.template initializePacket()); for (int i = 0; i < rem / 2; i++) { *p_scratch = __halves2half2( - input.m_impl.coeff(num_coeffs - packet_width + 2 * i), - input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); + input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 2 * i + 1)); p_scratch++; } if ((num_coeffs & 1) != 0) { - half last = input.m_impl.coeff(num_coeffs - 1); + half last = input.coeff(num_coeffs - 1); *p_scratch = __halves2half2(last, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); } } __syncthreads(); @@ -298,7 +303,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce 
for (Index i = 0; i < max_iter; i += BlockSize) { const Index index = first_index + packet_width * i; eigen_assert(index + packet_width < num_coeffs); - PacketType val = input.m_impl.template packet(index); + PacketType val = input.template packet(index); reducer.reducePacket(val, &accum); } @@ -337,7 +342,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(scratch, accum, reducer); + atomicReduce(reinterpret_cast(scratch), accum, reducer); } __syncthreads(); @@ -357,17 +362,21 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { eigen_assert(threadIdx.x == 1); - half2* pscratch = reinterpret_cast(scratch); - half tmp = __float2half(0.f); typedef packet_traits::type packet_type; - for (int i = 0; i < unpacket_traits::size; i += 2) { - reducer.reduce(__low2half(*pscratch), &tmp); - reducer.reduce(__high2half(*pscratch), &tmp); - pscratch++; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; } - *output = tmp; } #endif // EIGEN_HAS_GPU_FP16 @@ -416,13 +425,11 @@ template struct FullReductionLauncher { static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; - typedef typename packet_traits::type PacketType; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, 
block_size * num_per_thread); - PacketType* scratch = static_cast(device.scratchpad()); - // half2* scratch = static_cast(device.scratchpad()); + half* scratch = static_cast(device.scratchpad()); if (num_blocks > 1) { // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there -- GitLab From 4ef67cbfb25df4a9a72315ac67db74d5bdee5fde Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 31 Aug 2021 12:01:53 -0700 Subject: [PATCH 104/266] GCC 4.8 arm EIGEN_OPTIMIZATION_BARRIER fix (#2315). GCC 4.8 doesn't seem to like the `g` register constraint, failing to compile with "error: 'asm' operand requires impossible reload". Tested `r` instead, and that seems to work, even with latest compilers. Also fixed some minor macro issues to eliminate warnings on armv7. Fixes #2315. (cherry picked from commit ff07a8a63945d89301d1b29ac59d170ff9be3955) --- Eigen/src/Core/arch/NEON/PacketMath.h | 6 +++--- Eigen/src/Core/util/Macros.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index d2aeef430..6996cc8d3 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -155,7 +155,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif EIGEN_ARCH_ARM32 +#elif EIGEN_ARCH_ARM #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching @@ -3918,8 +3918,6 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } -#endif // EIGEN_ARCH_ARM64 - // Do we have an fp16 types and supporting Neon intrinsics? 
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC typedef float16x4_t Packet4hf; @@ -4580,6 +4578,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& } #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC +#endif // EIGEN_ARCH_ARM64 + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 986c3d44d..b74c99adf 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1131,7 +1131,7 @@ namespace Eigen { #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); #elif EIGEN_ARCH_ARM_OR_ARM64 // General, NEON. - #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X)); #elif EIGEN_ARCH_i386_OR_x86_64 // General, SSE. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); -- GitLab From 07cc36223818d1a3568c7dee3211480c7c26ff2e Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 1 Sep 2021 09:17:46 -0700 Subject: [PATCH 105/266] Fix EIGEN_OPTIMIZATION_BARRIER for arm-clang. Clang doesn't like !621, needs the "g" constraint back. The "g" constraint also works for GCC >= 5. This fixes our gitlab CI. (cherry picked from commit 3a6296d4f198ffbcccda4303919b3b14d5e54524) --- Eigen/src/Core/util/Macros.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index b74c99adf..085ab3fa8 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1131,7 +1131,16 @@ namespace Eigen { #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); #elif EIGEN_ARCH_ARM_OR_ARM64 // General, NEON. 
- #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X)); + // Clang doesn't like "r", + // error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type + // GCC < 5 doesn't like "g", + // error: 'asm' operand requires impossible reload + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0) + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X)); + #else + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #endif #elif EIGEN_ARCH_i386_OR_x86_64 // General, SSE. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); -- GitLab From 0fdc99c65e59bda015e4996768f0959ce5928c48 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 30 Aug 2021 12:26:39 -0700 Subject: [PATCH 106/266] Allow old Fortran code for LAPACK tests to compile despite argument mismatch errors (REAL passed to COMPLEX workspace argument) with GNU Fortran 10. (cherry picked from commit 7e096ddcb09d560a846b119691e48651e74ee677) --- lapack/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index e48497fda..4fc7fe195 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -5,6 +5,13 @@ include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) enable_language(Fortran) + if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") + if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER_EQUAL 10.0) + # We use an old version of LAPACK with argument type mismatches. + # Allow them to compile anyway with newer GNU versions. 
+ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fallow-argument-mismatch") + endif() + endif() set(EIGEN_Fortran_COMPILER_WORKS ON) else() set(EIGEN_Fortran_COMPILER_WORKS OFF) -- GitLab From 9263475740e974312f98024fc457c66317b8cc0f Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 30 Aug 2021 21:49:08 +0000 Subject: [PATCH 107/266] Add missing dependency on LAPACK test suite binaries to target `buildtests`, so `make check` will work correctly when `EIGEN_ENABLE_LAPACK_TESTS` is `ON`. (cherry picked from commit 6f429a202dc01e4e88d423d65bd3aa81a7af7d64) --- lapack/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 4fc7fe195..c8ca64001 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,4 +1,3 @@ - project(EigenLapack CXX) include(CheckLanguage) @@ -152,6 +151,7 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) string(REPLACE "." "_" input_name ${input}) set(testName "${target}_${input_name}") if(EXISTS "${TEST_INPUT}") + add_dependencies(buildtests ${target}) add_test(NAME LAPACK-${testName} COMMAND "${CMAKE_COMMAND}" -DTEST=$ -- GitLab From b8cf1ed753db6260c1f8cd9c58bdb8db6e255049 Mon Sep 17 00:00:00 2001 From: "Maxiwell S. 
Garcia" Date: Wed, 1 Sep 2021 08:32:49 -0500 Subject: [PATCH 108/266] Rename 'vec_all_nan' of cxx11_tensor_expr test because this symbol is used by altivec.h (cherry picked from commit 09fc0f97b53e22d8fef94acf0fbfeed3717ab906) --- unsupported/test/cxx11_tensor_expr.cpp | 52 +++++++++++++------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 169fc1898..27c284514 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -305,10 +305,10 @@ void test_minmax_nan_propagation_templ() { const Scalar kNaN = std::numeric_limits::quiet_NaN(); const Scalar kInf = std::numeric_limits::infinity(); const Scalar kZero(0); - Tensor vec_all_nan(size); + Tensor vec_full_nan(size); Tensor vec_one_nan(size); Tensor vec_zero(size); - vec_all_nan.setConstant(kNaN); + vec_full_nan.setConstant(kNaN); vec_zero.setZero(); vec_one_nan.setZero(); vec_one_nan(size/2) = kNaN; @@ -330,12 +330,12 @@ void test_minmax_nan_propagation_templ() { // max(nan, 0) = nan // max(0, nan) = nan // max(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMax(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_all_nan)); - verify_all_nan(vec_all_nan.template cwiseMax(kZero)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMax(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_full_nan)); + verify_all_nan(vec_full_nan.template cwiseMax(kZero)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_zero)); verify_all_nan(vec_zero.template cwiseMax(kNaN)); - verify_all_nan(vec_zero.template cwiseMax(vec_all_nan)); + verify_all_nan(vec_zero.template cwiseMax(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMax(kZero)); verify_all_zero(vec_zero.template cwiseMax(vec_zero)); @@ -344,12 +344,12 @@ void test_minmax_nan_propagation_templ() { // max(nan, 0) = 0 // max(0, nan) = 0 // max(0, 0) = 0 - 
verify_all_nan(vec_all_nan.template cwiseMax(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_all_nan)); - verify_all_zero(vec_all_nan.template cwiseMax(kZero)); - verify_all_zero(vec_all_nan.template cwiseMax(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMax(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_full_nan)); + verify_all_zero(vec_full_nan.template cwiseMax(kZero)); + verify_all_zero(vec_full_nan.template cwiseMax(vec_zero)); verify_all_zero(vec_zero.template cwiseMax(kNaN)); - verify_all_zero(vec_zero.template cwiseMax(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMax(kZero)); verify_all_zero(vec_zero.template cwiseMax(vec_zero)); @@ -358,12 +358,12 @@ void test_minmax_nan_propagation_templ() { // min(nan, 0) = nan // min(0, nan) = nan // min(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMin(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_all_nan)); - verify_all_nan(vec_all_nan.template cwiseMin(kZero)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMin(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_full_nan)); + verify_all_nan(vec_full_nan.template cwiseMin(kZero)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_zero)); verify_all_nan(vec_zero.template cwiseMin(kNaN)); - verify_all_nan(vec_zero.template cwiseMin(vec_all_nan)); + verify_all_nan(vec_zero.template cwiseMin(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMin(kZero)); verify_all_zero(vec_zero.template cwiseMin(vec_zero)); @@ -372,12 +372,12 @@ void test_minmax_nan_propagation_templ() { // min(nan, 0) = 0 // min(0, nan) = 0 // min(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMin(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_all_nan)); - verify_all_zero(vec_all_nan.template cwiseMin(kZero)); - verify_all_zero(vec_all_nan.template cwiseMin(vec_zero)); + 
verify_all_nan(vec_full_nan.template cwiseMin(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_full_nan)); + verify_all_zero(vec_full_nan.template cwiseMin(kZero)); + verify_all_zero(vec_full_nan.template cwiseMin(vec_zero)); verify_all_zero(vec_zero.template cwiseMin(kNaN)); - verify_all_zero(vec_zero.template cwiseMin(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMin(kZero)); verify_all_zero(vec_zero.template cwiseMin(vec_zero)); @@ -397,13 +397,13 @@ void test_minmax_nan_propagation_templ() { VERIFY_IS_EQUAL(val(), kZero); // Test NaN propagation for tensor of all NaNs. - val = vec_all_nan.template minimum(); + val = vec_full_nan.template minimum(); VERIFY((numext::isnan)(val())); - val = vec_all_nan.template minimum(); + val = vec_full_nan.template minimum(); VERIFY_IS_EQUAL(val(), kInf); - val = vec_all_nan.template maximum(); + val = vec_full_nan.template maximum(); VERIFY((numext::isnan)(val())); - val = vec_all_nan.template maximum(); + val = vec_full_nan.template maximum(); VERIFY_IS_EQUAL(val(), -kInf); // Test NaN propagation for tensor with a single NaN. -- GitLab From f03d3e7072e3d7c56b1b1bbb490685a254c56633 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 31 Aug 2021 08:32:21 -0700 Subject: [PATCH 109/266] Missing EIGEN_DEVICE_FUNCs to get `gpu_basic` passing with CUDA 9. CUDA 9 seems to require labelling defaulted constructors as `EIGEN_DEVICE_FUNC`, despite giving warnings that such labels are ignored. Without these labels, the `gpu_basic` test fails to compile, with errors about calling `__host__` functions from `__host__ __device__` functions. 
(cherry picked from commit 998bab4b04f26552b9875acfe113e69c7adccec4) --- Eigen/src/Core/Block.h | 8 ++++---- Eigen/src/Core/util/Macros.h | 10 +++++----- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 3206d6633..d0b95d50b 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -260,19 +260,19 @@ template - inline PacketScalar packet(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const { return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template - inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template - inline PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const { return m_xpr.template packet (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -280,7 +280,7 @@ template - inline void writePacket(Index index, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val) { m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 085ab3fa8..b436dfad3 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1225,7 +1225,7 @@ namespace Eigen { * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. 
*/ #if EIGEN_HAS_CXX11 -#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; #else #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) #endif @@ -1250,12 +1250,12 @@ namespace Eigen { */ #if EIGEN_HAS_CXX11 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() = default; \ - ~Derived() = default; + EIGEN_DEVICE_FUNC Derived() = default; \ + EIGEN_DEVICE_FUNC ~Derived() = default; #else #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() {}; \ - /* ~Derived() {}; */ + EIGEN_DEVICE_FUNC Derived() {}; \ + /* EIGEN_DEVICE_FUNC ~Derived() {}; */ #endif -- GitLab From 3395f4e604bde4006ff963aeeab271f11f3c2af7 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 2 Sep 2021 12:23:27 -0700 Subject: [PATCH 110/266] Fix tridiagonalization_inplace_selector. The `Options` of the new `hCoeffs` vector do not necessarily match those of the `MatrixType`, leading to build errors. Having the `CoeffVectorType` be a template parameter relieves this restriction. 
(cherry picked from commit ebd4b17d2f5ca29a5c16ebd35d54d7aeda587820) --- Eigen/src/Eigenvalues/Tridiagonalization.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 674c92a39..eda82794a 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -440,9 +440,8 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal template struct tridiagonalization_inplace_selector { - typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; - template + template static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { -- GitLab From 3335e0767cb847154e24f5d4fa345318309d1281 Mon Sep 17 00:00:00 2001 From: Ryan Pavlik Date: Wed, 8 Sep 2021 14:58:14 -0500 Subject: [PATCH 111/266] Fix typos in copyright dates --- test/ref.cpp | 2 +- test/sparse_ref.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/ref.cpp b/test/ref.cpp index ebfc70d3d..63eb65e27 100644 --- a/test/ref.cpp +++ b/test/ref.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. // -// Copyright (C) 20013 Gael Guennebaud +// Copyright (C) 2013 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp index 12b6f8a9d..8f33af858 100644 --- a/test/sparse_ref.cpp +++ b/test/sparse_ref.cpp @@ -1,7 +1,7 @@ // This file is part of Eigen, a lightweight C++ template library // for linear algebra. 
// -// Copyright (C) 20015 Gael Guennebaud +// Copyright (C) 2015 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed -- GitLab From f046e326d9e30772725d8fb26dc33328e418d9d3 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 17 Sep 2021 12:49:01 -0700 Subject: [PATCH 112/266] Fix strict aliasing bug causing product_small failure. Packet loading is skipped due to aliasing violation, leading to nullopt matrix multiplication. Fixes #2327. (cherry picked from commit 3c724c44cff3f9e2e9e35351abff0b5c022b320d) --- Eigen/src/Core/arch/AVX/Complex.h | 4 +++- Eigen/src/Core/arch/AVX512/Complex.h | 4 +++- Eigen/src/Core/arch/SSE/Complex.h | 11 +++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index ab7bd6c65..e9096c0a1 100644 --- a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -99,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 49c72b3f1..074253859 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { - return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return 
Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index 8fe22da46..215bfd7bb 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -106,14 +106,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - Packet2cf res; -#ifdef EIGEN_VECTORIZE_SSE3 - res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); -#else - res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); - res.v = _mm_movelh_ps(res.v, res.v); -#endif - return res; + const float re = std::real(from); + const float im = std::imag(from); + return Packet2cf(_mm_set_ps(im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } -- GitLab From 929bc0e191d0927b1735b9a1ddc0e8b77e3a25ec Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 20 Sep 2021 10:37:50 +0200 Subject: [PATCH 113/266] Fix alias violation in BFloat16 reinterpret_cast between unrelated types is undefined behavior and leads to misoptimizations on some platforms. Use the safer (and faster) version via bit_cast (cherry picked from commit b5eaa4269503f77d0aa58d2f8ed9419e1ba7784d) --- Eigen/src/Core/arch/Default/BFloat16.h | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index 1c28f4f95..f21d1a0a3 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h @@ -251,12 +251,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const output.value = std::signbit(v) ? 
0xFFC0: 0x7FC0; return output; } - const uint16_t* p = reinterpret_cast(&v); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - output.value = p[0]; -#else - output.value = p[1]; -#endif + output.value = static_cast(numext::bit_cast(v) >> 16); return output; } @@ -462,14 +457,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(&result); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - q[0] = h.value; -#else - q[1] = h.value; -#endif - return result; + return numext::bit_cast(static_cast(h.value) << 16); } // --- standard functions --- -- GitLab From a8eb797a43c3b9e1702f3031d91060d145b539d3 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 16 Sep 2021 16:16:47 -0700 Subject: [PATCH 114/266] Remove -fabi-version=6 flag from AVX512 builds. It was added to fix builds with gcc 4.9, but these don't even work today, and the flag breaks compilation with newer versions of gcc. (cherry picked from commit 1239adfcab1647482329a1c52396e52fca19f893) --- CMakeLists.txt | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f3e69b845..0d1ead92f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -253,18 +253,12 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX512 in tests/examples") endif() option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) if(EIGEN_TEST_AVX512DQ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX512DQ in tests/examples") endif() -- GitLab From ebd5c6d44b0e5fb837acb6d41569be81fb250168 Mon Sep 17 
00:00:00 2001 From: Antonio Sanchez Date: Wed, 22 Sep 2021 14:06:06 -0700 Subject: [PATCH 115/266] Add -mfma for AVX512DQ tests. (cherry picked from commit 76bb29c0c2e0948ca02af686a2b5ba4be6afcccc) --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0d1ead92f..f40cf7738 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -258,7 +258,7 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) if(EIGEN_TEST_AVX512DQ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq -mfma") message(STATUS "Enabling AVX512DQ in tests/examples") endif() -- GitLab From 71498b32c92c0326452aff0a57a4d20d5cad73c1 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 23 Sep 2021 10:49:08 -0700 Subject: [PATCH 116/266] Disable more NVCC warnings. The 2979 warning is yet another "calling a __host__ function from a __host__ __device__ function." Although we probably should eventually address these, they are flooding the logs. Most of these are harmless since we only call the original from the host. In cases where these are actually called from device, an error is generated instead anyways. The 2977 warning is a bit strange - although the warning suggests the `__device__` annotation is ignored, this doesn't actually seem to be the case. Without the `__device__` declarations, the kernel actually fails to run when attempting to construct such objects. Again, these warnings are flooding the logs, so disabling for now.
(cherry picked from commit 86c0decc480147d109b1dd8b968bcbc509b7a2e6) --- Eigen/src/Core/util/DisableStupidWarnings.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index fe0cfec0b..2f34a2e05 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -93,6 +93,12 @@ #pragma diag_suppress 2735 #pragma diag_suppress 2737 #pragma diag_suppress 2739 + #pragma diag_suppress 2979 + // Disable the "// __device__ annotation is ignored on a function(...) that is + // explicitly defaulted on its first declaration" message. + // The __device__ annotation seems to actually be needed in some cases, + // otherwise resulting in kernel runtime errors. + #pragma diag_suppress 2977 #endif #else -- GitLab From 7ea4adb5f07ee762fd9d8c3a9cde8edf4e6dce44 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 23 Sep 2021 13:43:18 -0700 Subject: [PATCH 117/266] Disable another device warning (cherry picked from commit e9e90892fecb4bebe6473e9de491bfcd6c0de37f) --- Eigen/src/Core/util/DisableStupidWarnings.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 2f34a2e05..3def95f47 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -93,6 +93,7 @@ #pragma diag_suppress 2735 #pragma diag_suppress 2737 #pragma diag_suppress 2739 + #pragma diag_suppress 2976 #pragma diag_suppress 2979 // Disable the "// __device__ annotation is ignored on a function(...) that is // explicitly defaulted on its first declaration" message. -- GitLab From 943ef50a2d5f0e22ca0c5d25d37caa5d2e81bd1d Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 27 Sep 2021 15:03:24 -0700 Subject: [PATCH 118/266] Disable testing of complex compound assignment operators for MSVC. 
MSVC does not support specializing compound assignments for `std::complex`, since it already specializes them (contrary to the standard). Trying to use one of these on device will currently lead to a duplicate definition error. This is still probably preferable to no error though. If we remove the definitions for MSVC, then it will compile, but the kernel will fail silently. The only proper solution would be to define our own custom `Complex` type. (cherry picked from commit f0f1d7938b7083800ff75fe88e15092f08a4e67e) --- Eigen/src/Core/arch/CUDA/Complex.h | 13 ++++++++++++- test/gpu_basic.cu | 4 ++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index deb4c8694..45f6ddb94 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -11,13 +11,24 @@ #ifndef EIGEN_COMPLEX_CUDA_H #define EIGEN_COMPLEX_CUDA_H -// clang-format off // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, GCC and older versions of clang do // not treat them as device functions and thus Eigen functors making use of // these operators fail to compile. Here, we manually specialize these // operators and functors for complex types when building for CUDA to enable // their use on-device. +// +// NOTES: +// - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device, +// since they are already specialized in the standard. Using them will result +// in silent kernel failures. +// - Compiling with MSVC and using +=,-=,*=,/=(std::complex) will lead +// to duplicate definition errors, since these are already specialized in +// Visual Studio's header (contrary to the standard). This is +// preferable to removing such definitions, which will lead to silent kernel +// failures. +// - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior +// to the first inclusion of . 
#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu index 4298da3bb..e424a93c9 100644 --- a/test/gpu_basic.cu +++ b/test/gpu_basic.cu @@ -138,10 +138,12 @@ struct complex_operators { out[out_idx++] = a / numext::real(b); out[out_idx++] = numext::real(a) / b; +#if !defined(EIGEN_COMP_MSVC) out[out_idx] = a; out[out_idx++] += b; out[out_idx] = a; out[out_idx++] -= b; out[out_idx] = a; out[out_idx++] *= b; out[out_idx] = a; out[out_idx++] /= b; +#endif const ComplexType true_value = ComplexType(ValueType(1), ValueType(0)); const ComplexType false_value = ComplexType(ValueType(0), ValueType(0)); @@ -188,6 +190,7 @@ struct complex_operators { res.segment(block_idx, size) = x1.real().array() / x2.array(); block_idx += size; +#if !defined(EIGEN_COMP_MSVC) res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2; block_idx += size; res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2; @@ -196,6 +199,7 @@ struct complex_operators { block_idx += size; res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array(); block_idx += size; +#endif const T true_vector = T::Constant(true_value); const T false_vector = T::Constant(false_value); -- GitLab From 05c9d7ce201bfcbe7a2706bef4f893038ae251e5 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 3 Sep 2021 11:07:18 -0700 Subject: [PATCH 119/266] Disable MSVC constant condition warning. We make extensive use of `if (CONSTANT)`, and cannot use C++17's `if constexpr`.
(cherry picked from commit 5bf35383e073d218be7a87bdca434be30d231e7e) --- Eigen/src/Core/util/DisableStupidWarnings.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index 3def95f47..e950749e7 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -4,6 +4,7 @@ #ifdef _MSC_VER // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable + // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -19,7 +20,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) -- GitLab From 5d918b82a80118ebb19572770a0c8e1f5fe06b91 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Thu, 21 Oct 2021 13:48:50 -0700 Subject: [PATCH 120/266] Add nan-propagation options to matrix and array plugins. 
--- Eigen/src/plugins/ArrayCwiseBinaryOps.h | 90 ++++++++++++++++++++++-- Eigen/src/plugins/MatrixCwiseBinaryOps.h | 48 ++++++++++--- test/array_for_matrix.cpp | 25 +++++++ 3 files changed, 148 insertions(+), 15 deletions(-) diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 0e5d5445b..a9ddb9058 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -30,15 +30,53 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa max() */ -EIGEN_MAKE_CWISE_BINARY_OP(min,min) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const OtherDerived &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const OtherDerived &other) const +{ + return (min)(other); +} /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other * * \sa max() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, - const CwiseNullaryOp, PlainObject> > +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const Scalar &other) const +{ + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN min #else @@ -46,7 +84,7 @@ min #endif (const Scalar &other) const { - return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the 
coefficient-wise max of \c *this and \a other @@ -56,14 +94,52 @@ min * * \sa min() */ -EIGEN_MAKE_CWISE_BINARY_OP(max,max) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const OtherDerived &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const OtherDerived &other) const +{ + return (max)(other); +} /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other * * \sa min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const Scalar &other) const +{ + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN max @@ -72,7 +148,7 @@ max #endif (const Scalar &other) const { - return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h index a0feef871..514d83a71 100644 --- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -72,23 +72,39 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa class CwiseBinaryOp, max() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> 
+cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return cwiseMin(other); } /** \returns an expression of the coefficient-wise min of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMin(const Scalar &other) const +{ + return cwiseMin(Derived::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMin(const Scalar &other) const { - return cwiseMin(Derived::Constant(rows(), cols(), other)); + return cwiseMin(Derived::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of *this and \a other @@ -98,23 +114,39 @@ cwiseMin(const Scalar &other) const * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return 
cwiseMax(other); } /** \returns an expression of the coefficient-wise max of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMax(const Scalar &other) const +{ + return cwiseMax(Derived::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMax(const Scalar &other) const { - return cwiseMax(Derived::Constant(rows(), cols(), other)); + return cwiseMax(Derived::Constant(rows(), cols(), other)); } diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index fb6be351e..8086b3432 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -211,6 +211,31 @@ template void cwise_min_max(const MatrixType& m) VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1).array(), (m1.array().max)( maxM1)); VERIFY_IS_APPROX(m1.array(), (m1.array().max)( minM1)); + // Test NaN propagation for min/max. + if (!NumTraits::IsInteger) { + m1(0,0) = NumTraits::quiet_NaN(); + // Elementwise. 
+ VERIFY((numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + + VERIFY((numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + + // Reductions. + VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + if (m1.size() > 1) { + VERIFY(!(numext::isnan)(m1.template maxCoeff())); + VERIFY(!(numext::isnan)(m1.template minCoeff())); + } else { + VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + } + } } template void resize(const MatrixTraits& t) -- GitLab From 96007cae8c48f9a3cc3a67638ee8bf25ddf1517e Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 26 Oct 2021 18:09:22 +0000 Subject: [PATCH 121/266] Remove license column in tables for builtin sparse solvers since all are MPL2 now. (cherry picked from commit 68e0d023c06e3611b7955b38bf8def66275c1c09) --- doc/SparseLinearSystems.dox | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 38754e4af..66d3bcd3c 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -13,24 +13,20 @@ They are summarized in the following tables: - + - - - -
ClassSolver kindMatrix kindFeatures related to performanceLicense

Notes

Notes

SimplicialLLT \n \#includeDirect LLt factorizationSPDFill-in reducingLGPL SimplicialLDLT is often preferable
SimplicialLDLT \n \#includeDirect LDLt factorizationSPDFill-in reducingLGPL Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)
SparseLU \n \#include LU factorization Square Fill-in reducing, Leverage fast dense algebraMPL2 optimized for small and large problems with irregular patterns
SparseQR \n \#include QR factorization Any, rectangular Fill-in reducingMPL2 recommended for least-square problems, has a basic rank-revealing feature
@@ -38,21 +34,18 @@ They are summarized in the following tables: - + - - -
ClassSolver kindMatrix kindSupported preconditioners, [default]License

Notes

Notes

ConjugateGradient \n \#include Classic iterative CGSPD IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholeskyMPL2 Recommended for large symmetric problems (e.g., 3D Poisson eq.)
LeastSquaresConjugateGradient \n \#includeCG for rectangular least-square problemRectangular IdentityPreconditioner, [LeastSquareDiagonalPreconditioner]MPL2 Solve for min |A'Ax-b|^2 without forming A'A
BiCGSTAB \n \#includeIterative stabilized bi-conjugate gradientSquare IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUTMPL2 To speedup the convergence, try it with the \ref IncompleteLUT preconditioner.
-- GitLab From 6b6ba412695298460fb2cae640967ebc6843a8bf Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Thu, 28 Oct 2021 09:28:29 -0700 Subject: [PATCH 122/266] Fix min/max nan-propagation for scalar "other". Copied input type from `EIGEN_MAKE_CWISE_BINARY_OP`. Fixes #2362. (cherry picked from commit 03d4cbb30796ea06350414f5f551b180e4864688) --- Eigen/src/plugins/ArrayCwiseBinaryOps.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index a9ddb9058..1b422e201 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -38,7 +38,7 @@ min #else (min) #endif -(const OtherDerived &other) const +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } @@ -102,7 +102,7 @@ max #else (max) #endif -(const OtherDerived &other) const +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } -- GitLab From 9c193db5c71d145c8e142ef440d2867217f5de67 Mon Sep 17 00:00:00 2001 From: Xinle Liu Date: Wed, 3 Nov 2021 10:56:14 -0700 Subject: [PATCH 123/266] Fix BDCSVD's total deflation in branch 3.4, similar to that of master in MR 707. 
(cherry picked from commit 4d045eba53f9a32d052eb942448ba62def066529) --- Eigen/src/SVD/BDCSVD.h | 14 +++++++++----- test/bdcsvd.cpp | 42 ++++++++++++++++++++++++++++++++++++++---- 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 17f8e4436..a76a8dd04 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -27,6 +27,10 @@ #define eigen_internal_assert(X) assert(X); #endif +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE +#include +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -172,7 +176,7 @@ public: void setSwitchSize(int s) { - eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3"); + eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3."); m_algoswap = s; } @@ -404,7 +408,7 @@ void BDCSVD::structured_update(Block A, co //@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; // lastCol + 1 - firstCol is the size of the submatrix. //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W) -//@param firstRowW : Same as firstRowW with the column. +//@param firstColW : Same as firstRowW with the column. //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. 
template @@ -899,7 +903,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif { // Check for total deflation - // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array() -void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0) +void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0, int algoswap = 16, bool random = true) { - MatrixType m = MatrixType::Random(a.rows(), a.cols()); - BDCSVD bdc_svd(m); + MatrixType m = random ? MatrixType::Random(a.rows(), a.cols()) : a; + + BDCSVD bdc_svd(m.rows(), m.cols(), computationOptions); + bdc_svd.setSwitchSize(algoswap); + bdc_svd.compute(m); + JacobiSVD jacobi_svd(m); VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues()); + if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); } +// Verifies total deflation is **not** triggered. +void compare_bdc_jacobi_instance(bool structure_as_m, int algoswap = 16) +{ + MatrixXd m(4, 3); + if (structure_as_m) { + // The first 3 rows are the reduced form of Matrix 1 as shown below, and it + // has nonzero elements in the first column and diagonals only. + m << 1.056293, 0, 0, + -0.336468, 0.907359, 0, + -1.566245, 0, 0.149150, + -0.1, 0, 0; + } else { + // Matrix 1. 
+ m << 0.882336, 18.3914, -26.7921, + -5.58135, 17.1931, -24.0892, + -20.794, 8.68496, -4.83103, + -8.4981, -10.5451, 23.9072; + } + compare_bdc_jacobi(m, 0, algoswap, false); +} + EIGEN_DECLARE_TEST(bdcsvd) { CALL_SUBTEST_3(( svd_verify_assert >(Matrix3f()) )); @@ -114,5 +140,13 @@ EIGEN_DECLARE_TEST(bdcsvd) // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); + + // Without total deflation issues. + CALL_SUBTEST_11(( compare_bdc_jacobi_instance(true) )); + CALL_SUBTEST_12(( compare_bdc_jacobi_instance(false) )); + + // With total deflation issues before, when it shouldn't be triggered. + CALL_SUBTEST_13(( compare_bdc_jacobi_instance(true, 3) )); + CALL_SUBTEST_14(( compare_bdc_jacobi_instance(false, 3) )); } -- GitLab From f9b2e92040f52739585a9036a72c2b8fa3576013 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 29 Oct 2021 09:28:11 -0700 Subject: [PATCH 124/266] Remove bad "take" impl that causes g++-11 crash. For some reason, having `take>` for `n > 0` causes g++-11 to ICE with ``` sorry, unimplemented: unexpected AST of kind nontype_argument_pack ``` It does work with other versions of gcc, and with clang. I filed a GCC bug [here](https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102999). Technically we should never actually run into this case, since you can't take n > 0 elements from an empty list. Commenting it out allows our Eigen tests to pass. 
(cherry picked from commit 8f8c2ba2fe19c6c2e47bbe2fbaf87594642e523d) --- unsupported/Eigen/CXX11/src/util/CXX11Meta.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index 149ceaff0..f662dee5b 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -81,7 +81,8 @@ template struct take<0, type_list> template<> struct take<0, type_list<>> { typedef type_list<> type; }; template struct take> : concat, typename take>::type> {}; -template struct take> { typedef numeric_list type; }; +// XXX The following breaks in gcc-11, and is invalid anyways. +// template struct take> { typedef numeric_list type; }; template struct take<0, numeric_list> { typedef numeric_list type; }; template struct take<0, numeric_list> { typedef numeric_list type; }; -- GitLab From 18824d10eaa46b39d42ae2b437fe760a2be25a7b Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 22 Oct 2021 08:52:16 -0700 Subject: [PATCH 125/266] Fix ZVector build. Cross-compiled via `s390x-linux-gnu-g++`, run via qemu. This allows the packetmath tests to pass. 
(cherry picked from commit 40bbe8a4d0eb3ec2bfd472fa30cac19e6e743b46) --- Eigen/src/Core/arch/ZVector/Complex.h | 14 ++++++++++++-- Eigen/src/Core/arch/ZVector/PacketMath.h | 4 ++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index 0b9b33d99..e0acedefb 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,18 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet2cf half; + typedef Packet4f as_real; +}; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet1cd half; + typedef Packet2d as_real; +}; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 1f55a90a5..208e4b167 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -91,8 +91,8 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { numext::bit_cast0x8000000000000000ull), - numext::bit_cast0x8000000000000000ull) 
}; +static Packet2d p2d_ZERO_ = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ -- GitLab From 23469c3cdaeeeb7b9f31e2d0971cddd114f1587d Mon Sep 17 00:00:00 2001 From: Andreas Krebbel Date: Tue, 26 Oct 2021 15:29:30 +0200 Subject: [PATCH 126/266] ZVector: Move alignas qualifier to come first We currently have plenty of type definitions with the alignment qualifier coming after the type. The compiler warns about ignoring them: int EIGEN_ALIGN16 ai[4]; Turn this into: EIGEN_ALIGN16 int ai[4]; (cherry picked from commit 8faafc3aaa2b45e234cfe0bef085c1134ceffc42) --- Eigen/src/Core/arch/ZVector/Complex.h | 8 ++++---- Eigen/src/Core/arch/ZVector/PacketMath.h | 24 ++++++++++++------------ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index e0acedefb..6c67cfe05 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -160,7 +160,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); return res; @@ -205,7 +205,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; @@ -235,14 +235,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, 
Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 208e4b167..a7b59c80e 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -358,7 +358,7 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -368,7 +368,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); @@ -376,7 +376,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -386,7 +386,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -460,8 +460,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; 
pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { @@ -639,7 +639,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -649,7 +649,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; pstore((float *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -785,7 +785,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) return p; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -943,7 +943,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ -953,7 +953,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; 
pstore((float*)af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -978,7 +978,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { -- GitLab From b0fe14213ec4c8615de485fe4c2bcd4bab0cbd19 Mon Sep 17 00:00:00 2001 From: Alex Druinsky Date: Wed, 20 Oct 2021 16:03:12 -0700 Subject: [PATCH 127/266] Fix vectorized reductions for Eigen::half Fixes compiler errors in expressions that look like Eigen::Matrix::Random().maxCoeff() The error comes from the code that creates the initial value for vectorized reductions. The fix is to specify the scalar type of the reduction's initial value. The change is necessary for Eigen::half because unlike other types, Eigen::half scalars cannot be implicitly created from integers.
(cherry picked from commit d0e3791b1a0e2db9edd5f1d1befdb2ac5a40efe0) --- Eigen/src/Core/PartialReduxEvaluator.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h index 29abf35b9..17c06f078 100644 --- a/Eigen/src/Core/PartialReduxEvaluator.h +++ b/Eigen/src/Core/PartialReduxEvaluator.h @@ -54,12 +54,17 @@ struct packetwise_redux_traits /* Value to be returned when size==0 , by default let's return 0 */ template EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const Func& ) { return pset1(0); } +PacketType packetwise_redux_empty_value(const Func& ) { + const typename unpacket_traits::type zero(0); + return pset1(zero); +} /* For products the default is 1 */ template EIGEN_DEVICE_FUNC -PacketType packetwise_redux_empty_value(const scalar_product_op& ) { return pset1(1); } +PacketType packetwise_redux_empty_value(const scalar_product_op& ) { + return pset1(Scalar(1)); +} /* Perform the actual reduction */ template Date: Thu, 21 Oct 2021 19:57:00 -0700 Subject: [PATCH 128/266] Fix broadcasting oob error. For vectorized 1-dimensional inputs that do not take the special blocking path (e.g. `std::complex<...>`), there was an index-out-of-bounds error causing the broadcast size to be computed incorrectly. Here we fix this, and make other minor cleanup changes. Fixes #2351. 
(cherry picked from commit a500da1dc089b08e2f2b3b05a2eb23194425460e) --- .../CXX11/src/Tensor/TensorBroadcasting.h | 59 +++++++++---------- .../test/cxx11_tensor_broadcasting.cpp | 18 ++++++ 2 files changed, 46 insertions(+), 31 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index a354132f6..8d8ad2658 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> typedef DSizes BroadcastDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; typedef typename TensorEvaluator::TensorBlock @@ -144,7 +144,7 @@ struct TensorEvaluator, Device> { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar - // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // and store the result in a scalar. Instead one should reshape the scalar into a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); @@ -410,25 +410,24 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Concatenates m_broadcast[dim] copies, + // [v0, ..., vN, v0, ..., vN, ... ] + // with dim == NumDims - 1 for col-major, dim == 0 for row-major. 
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - Index dim, inputIndex; - - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = NumDims - 1; - } else { - dim = 0; - } - - inputIndex = index % m_inputStrides[dim]; - if (inputIndex + PacketSize <= m_inputStrides[dim]) { + // Size of flattened tensor. + const Index M = (static_cast(Layout) == static_cast(ColMajor)) ? + m_inputStrides[NumDims - 1] : m_inputStrides[0]; + Index inputIndex = index % M; + if (inputIndex + PacketSize <= M) { return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { - if (inputIndex > m_inputStrides[dim]-1) { + if (inputIndex > M - 1) { inputIndex = 0; } values[i] = m_impl.coeff(inputIndex++); @@ -440,32 +439,30 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Interleaves m_broadcast[dim] copies, + // [v0, v0, ..., v1, v1, ..., vN, vN, ... ] + // with dim == 0 for col-major, dim == NumDims - 1 for row-major. EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); + eigen_assert(index + PacketSize-1 < dimensions().TotalSize()); - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - Index dim, inputIndex, outputOffset; + const Index M = (static_cast(Layout) == static_cast(ColMajor)) ? 
+ m_broadcast[0] : m_broadcast[NumDims - 1]; - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = 1; - } else { - dim = NumDims - 2; - } - - inputIndex = index / m_outputStrides[dim]; - outputOffset = index % m_outputStrides[dim]; - if (outputOffset + PacketSize <= m_outputStrides[dim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1(values); + Index inputIndex = index / M; + Index outputOffset = index % M; + if (outputOffset + PacketSize <= M) { + return internal::pset1(m_impl.coeff(inputIndex)); } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[dim]) { + for (int i = 0; i < PacketSize; ++i) { + if (outputOffset < M) { values[i] = m_impl.coeff(inputIndex); + ++outputOffset; } else { - values[i] = m_impl.coeff(++inputIndex); outputOffset = 0; - cur = 0; + values[i] = m_impl.coeff(++inputIndex); } } return internal::pload(values); diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index d3dab891f..cbd92c328 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -256,6 +256,22 @@ static void test_simple_broadcasting_n_by_one() } } +template +static void test_size_one_broadcasting() +{ + Tensor tensor(1); + tensor.setRandom(); + array broadcasts = {64}; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), broadcasts[0]); + + for (int i = 0; i < broadcasts[0]; ++i) { + VERIFY_IS_EQUAL(tensor(0), broadcast(i)); + } +} + template static void test_simple_broadcasting_one_by_n_by_one_1d() { @@ -328,4 +344,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting) CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); 
CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); + CALL_SUBTEST(test_size_one_broadcasting()); + CALL_SUBTEST(test_size_one_broadcasting()); } -- GitLab From 962a596d21332ec6901ba4a0787b3435264270c2 Mon Sep 17 00:00:00 2001 From: "Maxiwell S. Garcia" Date: Fri, 22 Oct 2021 12:17:55 -0500 Subject: [PATCH 129/266] test: fix boostmutiprec test to compile with older Boost versions Eigen boostmultiprec test redefines a symbol that is already defined inside Boot Math [1]. Boost has fixed it recently [2], but this patch avoids errors if Boost version was less than 1.77. https://github.com/boostorg/math/blob/boost-1.76.0/include/boost/math/policies/policy.hpp#L18 https://github.com/boostorg/math/commit/68307123029676ba5cb316f8dd1d1c98d1fc7b23#diff-c7a8e5911c2e6be4138e1a966d762200f147792ac16ad96fdcc724313d11f839 (cherry picked from commit 99600bd1a6b133f8684800c855e59d4fd22ac23e) --- test/boostmultiprec.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp index 7c79ded23..e83e97044 100644 --- a/test/boostmultiprec.cpp +++ b/test/boostmultiprec.cpp @@ -74,8 +74,7 @@ #include #include -namespace mp = boost::multiprecision; -typedef mp::number, mp::et_on> Real; +typedef boost::multiprecision::number, boost::multiprecision::et_on> Real; namespace Eigen { template<> struct NumTraits : GenericNumTraits { -- GitLab From 71320af66a2401cf7d843de9b785655014a5e261 Mon Sep 17 00:00:00 2001 From: Nico Date: Tue, 19 Oct 2021 16:52:57 +0000 Subject: [PATCH 130/266] Fix -Wbitwise-instead-of-logical clang warning & and | short-circuit, && and || don't. When both arguments to those are boolean, the short-circuiting version is usually the desired one, so clang warns on this. Here, it is inconsequential, so switch to && and || to suppress the warning. 
(cherry picked from commit b17bcddbca749f621040990a3efb840046315050) --- unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index d23f2e4c8..afbcba4a2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -78,14 +78,14 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator == (const TensorUInt128& lhs, const TensorUInt128& rhs) { - return (lhs.high == rhs.high) & (lhs.low == rhs.low); + return (lhs.high == rhs.high) && (lhs.low == rhs.low); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator != (const TensorUInt128& lhs, const TensorUInt128& rhs) { - return (lhs.high != rhs.high) | (lhs.low != rhs.low); + return (lhs.high != rhs.high) || (lhs.low != rhs.low); } template -- GitLab From fbdaff81bdbaac02a7f5e3afa03e2465e0a9c025 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 28 Oct 2021 21:59:41 +0000 Subject: [PATCH 131/266] Invert rows and depth in non-vectorized portion of packing (PowerPC). 
(cherry picked from commit 9cf34ee0aed25a7464e6ec14f977cfa940f48f1b) --- Eigen/src/Core/arch/AltiVec/Complex.h | 10 +- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 1552 ++++++++--------- .../Core/arch/AltiVec/MatrixProductCommon.h | 206 +-- .../src/Core/arch/AltiVec/MatrixProductMMA.h | 337 ++-- 4 files changed, 931 insertions(+), 1174 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index f424f11cf..b3932998c 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -127,20 +127,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } -EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex& from0, const std::complex& from1) { Packet4f res0, res1; #ifdef __VSX__ - __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); - __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); #ifdef _BIG_ENDIAN __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #else __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #endif #else - *reinterpret_cast *>(&res0) = *from0; - *reinterpret_cast *>(&res1) = *from1; + *reinterpret_cast *>(&res0) = from0; + *reinterpret_cast *>(&res1) = from1; res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); #endif return Packet2cf(res0); diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 3f79b97df..8feb88ea7 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ 
b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -164,24 +164,23 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc rir += vectorDelta; } - if (j < cols) + + for(; j < cols; j++) { - rii = rir + ((cols - j) * rows); + rii = rir + rows; for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - std::complex v = getAdjointVal(i, k, rhs); + std::complex v = getAdjointVal(i, j, rhs); - blockBf[rir] = v.real(); - blockBf[rii] = v.imag(); + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); - rir += 1; - rii += 1; - } + rir += 1; + rii += 1; } + + rir += rows; } } @@ -260,19 +259,15 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs } } - if (j < cols) + for(; j < cols; j++) { for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - if(k <= i) - blockB[ri] = rhs(i, k); - else - blockB[ri] = rhs(k, i); - ri += 1; - } + if(j <= i) + blockB[ri] = rhs(i, j); + else + blockB[ri] = rhs(j, i); + ri += 1; } } } @@ -406,22 +401,18 @@ struct symm_pack_lhs * and offset and behaves accordingly. **/ -template -EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) -{ - const Index size = 16 / sizeof(Scalar); - pstore(to + (0 * size), block.packet[0]); - pstore(to + (1 * size), block.packet[1]); - pstore(to + (2 * size), block.packet[2]); - pstore(to + (3 * size), block.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); pstore(to + (1 * size), block.packet[1]); + if (N > 2) { + pstore(to + (2 * size), block.packet[2]); + } + if (N > 3) { + pstore(to + (3 * size), block.packet[3]); + } } // General template for lhs & rhs complex packing. 
@@ -447,9 +438,9 @@ struct dhs_cpack { PacketBlock cblock; if (UseLhs) { - bload(cblock, lhs, j, i); + bload(cblock, lhs, j, i); } else { - bload(cblock, lhs, i, j); + bload(cblock, lhs, i, j); } blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); @@ -476,8 +467,8 @@ struct dhs_cpack { ptranspose(blocki); } - storeBlock(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 4*vectorSize; rii += 4*vectorSize; @@ -497,21 +488,12 @@ struct dhs_cpack { cblock.packet[1] = lhs.template loadPacket(i, j + 2); } } else { - std::complex lhs0, lhs1; if (UseLhs) { - lhs0 = lhs(j + 0, i); - lhs1 = lhs(j + 1, i); - cblock.packet[0] = pload2(&lhs0, &lhs1); - lhs0 = lhs(j + 2, i); - lhs1 = lhs(j + 3, i); - cblock.packet[1] = pload2(&lhs0, &lhs1); + cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i)); + cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i)); } else { - lhs0 = lhs(i, j + 0); - lhs1 = lhs(i, j + 1); - cblock.packet[0] = pload2(&lhs0, &lhs1); - lhs0 = lhs(i, j + 2); - lhs1 = lhs(i, j + 3); - cblock.packet[1] = pload2(&lhs0, &lhs1); + cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1)); + cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3)); } } @@ -533,34 +515,50 @@ struct dhs_cpack { rir += ((PanelMode) ? (vectorSize*(2*stride - depth)) : vectorDelta); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) rir += (offset*(rows - j - vectorSize)); - rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + if(PanelMode) rir -= (offset*(vectorSize - 1)); - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + rii = rir + ((PanelMode) ? stride : depth); + + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockAt[rir] = lhs(i, j).real(); + + if(Conjugate) + blockAt[rii] = -lhs(i, j).imag(); + else + blockAt[rii] = lhs(i, j).imag(); + + rir += 1; + rii += 1; + } + + rir += ((PanelMode) ? 
(2*stride - depth) : depth); + } + } else { + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockAt[rir] = lhs(k, i).real(); if(Conjugate) blockAt[rii] = -lhs(k, i).imag(); else blockAt[rii] = lhs(k, i).imag(); - } else { - blockAt[rir] = lhs(i, k).real(); - if(Conjugate) - blockAt[rii] = -lhs(i, k).imag(); - else - blockAt[rii] = lhs(i, k).imag(); + rir += 1; + rii += 1; } - - rir += 1; - rii += 1; } } } @@ -586,16 +584,16 @@ struct dhs_pack{ PacketBlock block; if (UseLhs) { - bload(block, lhs, j, i); + bload(block, lhs, j, i); } else { - bload(block, lhs, i, j); + bload(block, lhs, i, j); } if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { ptranspose(block); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 4*vectorSize; } @@ -630,21 +628,33 @@ struct dhs_pack{ if(PanelMode) ri += vectorSize*(stride - offset - depth); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) ri += offset*(rows - j); + if(PanelMode) ri += offset; - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockA[ri] = lhs(i, j); + ri += 1; + } + + if(PanelMode) ri += stride - depth; + } + } else { + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockA[ri] = lhs(k, i); - } else { - blockA[ri] = lhs(i, k); + ri += 1; } - ri += 1; } } } @@ -680,7 +690,7 @@ struct dhs_pack(j, i + 1); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 2*vectorSize; } @@ -757,7 +767,7 @@ struct dhs_pack(i + 1, j + 0); //[b1 b2] block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] - storeBlock(blockB + ri, block); + storeBlock(blockB + ri, block); } ri += 
4*vectorSize; @@ -788,19 +798,17 @@ struct dhs_pack(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -941,7 +949,7 @@ struct dhs_cpack cblock; PacketBlock blockr, blocki; - bload(cblock, rhs, i, j); + bload(cblock, rhs, i, j); blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); @@ -955,8 +963,8 @@ struct dhs_cpack(blockBt + rir, blockr); - storeBlock(blockBt + rii, blocki); + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -965,27 +973,26 @@ struct dhs_cpack -EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) -{ - if(NegativeAccumulate) - { - acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); - acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); - acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); - acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); - } else { - acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]); - acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); - acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); - acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); - } -} - -template -EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) +template +EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) { if(NegativeAccumulate) { acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]); + if (N > 1) { + acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]); + } + if (N > 2) { + acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]); + } + if (N > 3) { + acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]); + } } else { acc->packet[0] = vec_madd(lhsV, 
rhsV[0], acc->packet[0]); + if (N > 1) { + acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]); + } + if (N > 2) { + acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]); + } + if (N > 3) { + acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]); + } } } @@ -1028,11 +1036,11 @@ EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, con { Packet lhsV = pload(lhs); - pger_common(acc, lhsV, rhsV); + pger_common(acc, lhsV, rhsV); } -template -EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar)); @@ -1044,32 +1052,32 @@ EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In #endif } -template -EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV) { Packet lhsV; - loadPacketRemaining(lhs, lhsV, remaining_rows); + loadPacketRemaining(lhs, lhsV); - pger_common(acc, lhsV, rhsV); + pger_common(acc, lhsV, rhsV); } // 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types real * complex and complex * real. 
template EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) { - pger_common(accReal, lhsV, rhsV); + pger_common(accReal, lhsV, rhsV); if(LhsIsReal) { - pger_common(accImag, lhsV, rhsVi); + pger_common(accImag, lhsV, rhsVi); EIGEN_UNUSED_VARIABLE(lhsVi); } else { if (!RhsIsReal) { - pger_common(accReal, lhsVi, rhsVi); - pger_common(accImag, lhsV, rhsVi); + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); } else { EIGEN_UNUSED_VARIABLE(rhsVi); } - pger_common(accImag, lhsVi, rhsV); + pger_common(accImag, lhsVi, rhsV); } } @@ -1084,8 +1092,8 @@ EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } -template -EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); @@ -1101,11 +1109,11 @@ EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar #endif } -template -EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV, lhsVi; - loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); + loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi); pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } @@ -1117,132 +1125,142 @@ EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) } // Zero the accumulator on 
PacketBlock. -template -EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) -{ - acc.packet[0] = pset1((Scalar)0); - acc.packet[1] = pset1((Scalar)0); - acc.packet[2] = pset1((Scalar)0); - acc.packet[3] = pset1((Scalar)0); -} - -template -EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); + if (N > 1) { + acc.packet[1] = pset1((Scalar)0); + } + if (N > 2) { + acc.packet[2] = pset1((Scalar)0); + } + if (N > 3) { + acc.packet[3] = pset1((Scalar)0); + } } // Scale the PacketBlock vectors by alpha. -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); - acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); - acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); - acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) { acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]); + if (N > 1) { + acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]); + } + if (N > 2) { + acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]); + } + if (N > 3) { + acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]); + } } -template -EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) -{ - acc.packet[0] = pmul(accZ.packet[0], pAlpha); - acc.packet[1] = pmul(accZ.packet[1], pAlpha); - acc.packet[2] = pmul(accZ.packet[2], pAlpha); - acc.packet[3] = pmul(accZ.packet[3], pAlpha); -} - -template -EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha) +template +EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const 
Packet& pAlpha) { acc.packet[0] = pmul(accZ.packet[0], pAlpha); + if (N > 1) { + acc.packet[1] = pmul(accZ.packet[1], pAlpha); + } + if (N > 2) { + acc.packet[2] = pmul(accZ.packet[2], pAlpha); + } + if (N > 3) { + acc.packet[3] = pmul(accZ.packet[3], pAlpha); + } } // Complex version of PacketBlock scaling. template EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag) { - bscalec_common(cReal, aReal, bReal); + bscalec_common(cReal, aReal, bReal); - bscalec_common(cImag, aImag, bReal); + bscalec_common(cImag, aImag, bReal); - pger_common(&cReal, bImag, aImag.packet); + pger_common(&cReal, bImag, aImag.packet); - pger_common(&cImag, bImag, aReal.packet); + pger_common(&cImag, bImag, aReal.packet); } -template -EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask) { acc.packet[0] = pand(acc.packet[0], pMask); - acc.packet[1] = pand(acc.packet[1], pMask); - acc.packet[2] = pand(acc.packet[2], pMask); - acc.packet[3] = pand(acc.packet[3], pMask); + if (N > 1) { + acc.packet[1] = pand(acc.packet[1], pMask); + } + if (N > 2) { + acc.packet[2] = pand(acc.packet[2], pMask); + } + if (N > 3) { + acc.packet[3] = pand(acc.packet[3], pMask); + } } -template -EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask) { - band(aReal, pMask); - band(aImag, pMask); + band(aReal, pMask); + band(aImag, pMask); - bscalec(aReal, aImag, bReal, bImag, cReal, cImag); + bscalec(aReal, aImag, bReal, bImag, cReal, cImag); } // Load a PacketBlock, the N parameters make tunning gemm easier so we can 
add more accumulators as needed. -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - } -} - -// An overload of bload when you have a PacketBLock with 8 vectors. -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); - acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); - acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); - acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); + acc.packet[0] = res.template loadPacket(row + 0, col); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row + 1, col); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row + 2, col); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row + 3, col); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + 0, col + accCols); 
+ if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + 1, col + accCols); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + 2, col + accCols); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + 3, col + accCols); + } + } } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); - acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); - acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); - acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); + acc.packet[0] = res.template loadPacket(row, col + 0); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row, col + 1); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row, col + 2); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row, col + 3); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + accCols, col + 0); + if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + accCols, col + 1); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + accCols, col + 2); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + accCols, col + 3); + } + } } } -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); -} - const static Packet4i mask41 = { -1, 0, 0, 0 }; const static Packet4i mask42 = { -1, -1, 0, 0 }; const static Packet4i mask43 = { -1, -1, -1, 0 }; @@ -1273,22 +1291,44 @@ EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) } } -template 
-EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) { - band(accZ, pMask); + band(accZ, pMask); - bscale(acc, accZ, pAlpha); + bscale(acc, accZ, pAlpha); } -template -EIGEN_ALWAYS_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +template EIGEN_ALWAYS_INLINE void +pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - pbroadcast4(a, a0, a1, a2, a3); + pbroadcast4(a, a0, a1, a2, a3); } template<> -EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { a1 = pload(a); a3 = pload(a + 2); @@ -1298,89 +1338,96 @@ EIGEN_ALWAYS_INLINE void pbroadcast4_old(const double* a, Packet2d& a0 a3 = vec_splat(a3, 1); } -// PEEL loop factor. 
-#define PEEL 7 - -template -EIGEN_ALWAYS_INLINE void MICRO_EXTRA_COL( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows, - Index remaining_cols) +template EIGEN_ALWAYS_INLINE void +pbroadcastN(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } } -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha) +template<> EIGEN_ALWAYS_INLINE void +pbroadcastN(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero; + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} - bsetzero(accZero); +// PEEL loop factor. 
+#define PEEL 7 +#define PEEL_ROW 7 - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); +#define MICRO_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_ZERO_PEEL(peel) \ + if ((PEEL_ROW > peel) && (peel != 0)) { \ + bsetzero(accZero##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##peel); \ } - for(; k < depth; k++) - { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + +#define MICRO_ZERO_PEEL_ROW \ + MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL); + +#define MICRO_WORK_PEEL(peel) \ + if (PEEL_ROW > peel) { \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + pger(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } - accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); - for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col) += accZero.packet[0][i]; +#define MICRO_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \ + MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \ + lhs_ptr += (remaining_rows * PEEL_ROW); \ + rhs_ptr += (accRows * PEEL_ROW); + +#define MICRO_ADD_PEEL(peel, sum) \ + if (PEEL_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accZero##sum.packet[i] += accZero##peel.packet[i]; \ + } \ } -} -template +#define MICRO_ADD_PEEL_ROW \ + MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \ + 
MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0) + +template EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows) + PacketBlock &accZero) { Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } -template -EIGEN_STRONG_INLINE void gemm_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1391,59 +1438,89 @@ EIGEN_STRONG_INLINE void gemm_extra_row( Index col, Index rows, Index cols, - Index remaining_rows, const Packet& pAlpha, const Packet& pMask) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero, acc; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc; - bsetzero(accZero); + bsetzero(accZero0); - Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? 
depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); - } + if (remaining_depth >= PEEL_ROW) { + MICRO_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + MICRO_WORK_PEEL_ROW + } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth); + MICRO_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero0); } if ((remaining_depth == depth) && (rows >= accCols)) { - for(Index j = 0; j < 4; j++) { - acc.packet[j] = res.template loadPacket(row, col + j); - } - bscale(acc, accZero, pAlpha, pMask); - res.template storePacketBlock(row, col, acc); + bload(acc, res, row, 0); + bscale(acc, accZero0, pAlpha, pMask); + res.template storePacketBlock(row, 0, acc); } else { for(; k < depth; k++) { Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero0, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } - for(Index j = 0; j < 4; j++) { - accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); - } - for(Index j = 0; j < 4; j++) { + for(Index j = 0; j < accRows; j++) { + accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]); for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col + j) += accZero.packet[j][i]; + res(row + i, j) += accZero0.packet[j][i]; } } } } +template +EIGEN_ALWAYS_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const 
Packet& pMask) +{ + switch(remaining_rows) { + case 1: + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; + } +} + #define MICRO_UNROLL(func) \ func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) @@ -1462,34 +1539,24 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_WORK_ONE(iter, peel) \ if (unroll_factor > iter) { \ - pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ } #define MICRO_TYPE_PEEL4(func, func2, peel) \ if (PEEL > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ - MICRO_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - } - -#define MICRO_TYPE_PEEL1(func, func2, peel) \ - if (PEEL > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ MICRO_UNROLL_WORK(func, func2, peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \ func(func1,func2,0); func(func1,func2,1); \ func(func1,func2,2); func(func1,func2,3); \ func(func1,func2,4); 
func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,6); func(func1,func2,7); #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M]; \ @@ -1503,17 +1570,9 @@ EIGEN_STRONG_INLINE void gemm_extra_row( MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ rhs_ptr += accRows; -#define MICRO_ONE_PEEL1 \ - MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += (remaining_cols * PEEL); - -#define MICRO_ONE1 \ - MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += remaining_cols; - #define MICRO_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accZero##iter); \ + bsetzero(accZero##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ } @@ -1522,7 +1581,7 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -1538,25 +1597,13 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ - acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ - acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ + bload(acc, res, row + iter*accCols, 0); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, 0, acc); \ } #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) -#define MICRO_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - 
acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ - } - -#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const DataMapper& res, @@ -1564,15 +1611,13 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; MICRO_SRC_PTR MICRO_DST_PTR @@ -1593,101 +1638,100 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( row += unroll_factor*accCols; } -template -EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index& row, + Index strideB, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlpha) + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, *lhs_ptr7 = NULL; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; - - MICRO_SRC_PTR - MICRO_DST_PTR - - Index k = 0; - for(; k + PEEL <= depth; k+= PEEL) - { - 
EIGEN_POWER_PREFETCH(rhs_ptr); - MICRO_PREFETCH - MICRO_ONE_PEEL1 - } - for(; k < depth; k++) - { - MICRO_ONE1 - } - MICRO_COL_STORE + const DataMapper res3 = res.getSubMapper(0, col); - row += unroll_factor*accCols; -} + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha) -{ #define MAX_UNROLL 6 while(row + MAX_UNROLL*accCols <= rows) { - gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); } switch( (rows-row)/accCols ) { #if MAX_UNROLL > 7 case 7: - gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 6 case 6: - gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 5 - case 5: - gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 5: + gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 4 - case 4: - 
gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 4: + gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 3 - case 3: - gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 3: + gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 2 - case 2: - gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 2: + gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 1 - case 1: - gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 1: + gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; + default: + break; } #undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } +} + +template +EIGEN_STRONG_INLINE void gemm_extra_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + for (; col < cols; col++) { + 
gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); + } } /**************** @@ -1697,7 +1741,6 @@ template(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_UNROLL > 7 - case 7: - gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 6 - case 6: - gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 5 - case 5: - gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 4 - case 4: - gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 3 - case 3: - gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 2 - case 2: - gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 1 - case 1: - gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } - - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* 
lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } - } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -1789,117 +1763,66 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const // PEEL_COMPLEX loop factor. #define PEEL_COMPLEX 3 +#define PEEL_COMPLEX_ROW 3 -template -EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_COL( - const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, - const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows, - Index remaining_cols) -{ - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); -} +#define MICRO_COMPLEX_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& 
pAlphaReal, - const Packet& pAlphaImag) -{ - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; - if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - - bsetzero(accReal); - bsetzero(accImag); - - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); +#define MICRO_COMPLEX_ZERO_PEEL(peel) \ + if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \ + bsetzero(accReal##peel); \ + bsetzero(accImag##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##peel); \ + EIGEN_UNUSED_VARIABLE(accImag##peel); \ } - for(; k < depth; k++) - { - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += 
remaining_cols; +#define MICRO_COMPLEX_ZERO_PEEL_ROW \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL); + +#define MICRO_COMPLEX_WORK_PEEL(peel) \ + if (PEEL_COMPLEX_ROW > peel) { \ + pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + pgerc(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); +#define MICRO_COMPLEX_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \ + Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \ + lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \ + if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \ + rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) - { - res(row + 0, col + 0) += pfirst(acc0.packet[0]); - } else { - acc0.packet[0] += res.template loadPacket(row + 0, col + 0); - res.template storePacketBlock(row + 0, col + 0, acc0); - if(remaining_rows > accColsC) { - res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); - } +#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \ + if (PEEL_COMPLEX_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accReal##sum.packet[i] += accReal##peel.packet[i]; \ + accImag##sum.packet[i] += accImag##peel.packet[i]; \ + } \ } -} -template +#define MICRO_COMPLEX_ADD_PEEL_ROW \ + 
MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ + MICRO_COMPLEX_ADD_PEEL(1, 0) + +template EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows) + PacketBlock &accReal, PacketBlock &accImag) { Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); @@ -1908,8 +1831,8 @@ EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1921,106 +1844,141 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( Index col, Index rows, Index cols, - Index remaining_rows, const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; + const Scalar* lhs_ptr_imag = NULL; if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + 
remaining_rows*strideA; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; - bsetzero(accReal); - bsetzero(accImag); + bsetzero(accReal0); + bsetzero(accImag0); - Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } + if (remaining_depth >= PEEL_COMPLEX_ROW) { + MICRO_COMPLEX_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + MICRO_COMPLEX_WORK_PEEL_ROW + } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth); + MICRO_COMPLEX_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0); } if ((remaining_depth == depth) && (rows >= accCols)) { - bload(tRes, res, row, col); - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); - bcouple(taccReal, taccImag, tRes, acc0, acc1); - res.template storePacketBlock(row + 0, col, acc0); 
- res.template storePacketBlock(row + accColsC, col, acc1); + bload(tRes, res, row, 0); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + res.template storePacketBlock(row + 0, 0, acc0); + res.template storePacketBlock(row + accColsC, 0, acc1); } else { for(; k < depth; k++) { Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; rhs_ptr_real += accRows; if(!RhsIsReal) rhs_ptr_imag += accRows; } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) { - for(Index j = 0; j < 4; j++) { - res(row + 0, col + j) += pfirst(acc0.packet[j]); + for(Index j = 0; j < accRows; j++) { + res(row + 0, j) += pfirst(acc0.packet[j]); } } else { - for(Index j = 0; j < 4; j++) { + for(Index j = 0; j < accRows; j++) { PacketBlock acc2; - acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; - res.template storePacketBlock(row + 0, col + j, acc2); + acc2.packet[0] = res.template loadPacket(row + 0, j) + acc0.packet[j]; + res.template storePacketBlock(row + 0, j, acc2); if(remaining_rows > accColsC) { - res(row + accColsC, col + j) += 
pfirst(acc1.packet[j]); + res(row + accColsC, j) += pfirst(acc1.packet[j]); } } } } } +template +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + switch(remaining_rows) { + case 1: + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + } + break; + } +} + #define MICRO_COMPLEX_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ MICRO_COMPLEX_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) + func(0,peel) func(1,peel) func(2,peel) func(3,peel) #define MICRO_COMPLEX_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -2028,37 +1986,16 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define 
MICRO_COMPLEX_WORK_ONE4(iter, peel) \ if (unroll_factor > iter) { \ - pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ - } - -#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ - if (unroll_factor > iter) { \ - pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgerc_common(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ } #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ if(!RhsIsReal) { \ - pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } \ - MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } - -#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ - if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ - if(!RhsIsReal) { \ - rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ + pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ } else { \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ @@ -2069,13 +2006,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( } #define 
MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ - Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \ func(func1,func2,0); func(func1,func2,1); \ - func(func1,func2,2); func(func1,func2,3); \ - func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,2); func(func1,func2,3); #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M], rhsVi0[M];\ @@ -2091,20 +2025,10 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( rhs_ptr_real += accRows; \ if(!RhsIsReal) rhs_ptr_imag += accRows; -#define MICRO_COMPLEX_ONE_PEEL1 \ - MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ - if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); - -#define MICRO_COMPLEX_ONE1 \ - MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += remaining_cols; \ - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accReal##iter); \ - bsetzero(accImag##iter); \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accReal##iter); \ EIGEN_UNUSED_VARIABLE(accImag##iter); \ @@ -2114,15 +2038,9 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = 
lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) @@ -2130,35 +2048,21 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( #define MICRO_COMPLEX_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) #define MICRO_COMPLEX_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + bload(tRes, res, row + iter*accCols, 0); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + iter*accCols + 0, 0, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, 0, acc1); \ } #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) -#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ - } - -#define MICRO_COMPLEX_COL_STORE 
MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const DataMapper& res, @@ -2166,29 +2070,26 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_DST_PTR @@ -2212,112 +2113,93 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( row += unroll_factor*accCols; } -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_complex_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, + Index offsetB, Index col, - Index remaining_cols, + 
Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) { - rhs_ptr_imag = rhs_base + remaining_cols*strideB; - } else { - EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const DataMapper res3 = res.getSubMapper(0, col); - MICRO_COMPLEX_SRC_PTR - MICRO_COMPLEX_DST_PTR + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - Index k = 0; - for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - MICRO_COMPLEX_PREFETCH - MICRO_COMPLEX_ONE_PEEL1 +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); } - for(; k < depth; k++) - { - MICRO_COMPLEX_ONE1 + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, 
DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; } - MICRO_COMPLEX_COL_STORE +#undef MAX_COMPLEX_UNROLL - row += unroll_factor*accCols; + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, 
ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; + for (; col < cols; col++) { + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } -#undef MAX_COMPLEX_UNROLL } template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -2332,64 +2214,10 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = 
blockA; - Index row = 0; - -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = 
blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 33d543494..bf01dba1c 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -9,22 +9,8 @@ namespace Eigen { namespace internal { -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha); - template -EIGEN_STRONG_INLINE void gemm_extra_row( +EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -39,41 +25,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( +template +EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index& row, - Index rows, + Index strideB, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlpha); + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask); template EIGEN_ALWAYS_INLINE Packet bmask(const int 
remaining_rows); template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag); - -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -91,123 +64,88 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pMask); template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlphaImag, + const Packet& pMask); template EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); -template -EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); template EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); -const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, - 16, 17, 18, 19, - 
4, 5, 6, 7, - 20, 21, 22, 23}; - -const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, - 24, 25, 26, 27, - 12, 13, 14, 15, - 28, 29, 30, 31}; -//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 -const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, - 16, 17, 18, 19, 20, 21, 22, 23}; - -//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 -const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31}; - - // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. -template -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); -} - -template -EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) -{ - bcouple_common(taccReal, taccImag, acc1, acc2); - - acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); - acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); - acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); - - 
acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); - acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); - acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); - acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); -} - -template -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); + acc1.packet[0].v = vec_mergeh(taccReal.packet[0], taccImag.packet[0]); + if (N > 1) { + acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]); + } + + acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]); + if (N > 1) { + acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]); + } } -template -EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { - bcouple_common(taccReal, taccImag, acc1, acc2); + bcouple_common(taccReal, taccImag, acc1, acc2); acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - - acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); -} - -template<> -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, 
PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); -} - -template<> -EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); + if (N > 1) { + acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); + } + if (N > 2) { + acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); + } + if (N > 3) { + acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + } + + acc2.packet[0] = padd(tRes.packet[0+N], acc2.packet[0]); + if (N > 1) { + acc2.packet[1] = padd(tRes.packet[1+N], acc2.packet[1]); + } + if (N > 2) { + acc2.packet[2] = padd(tRes.packet[2+N], acc2.packet[2]); + } + if (N > 3) { + acc2.packet[3] = padd(tRes.packet[3+N], acc2.packet[3]); + } } // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. 
diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 6540c6fa6..5b4449537 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -11,7 +11,7 @@ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H -#pragma GCC target("cpu=power10") +#pragma GCC target("cpu=power10,htm") #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) @@ -30,37 +30,37 @@ EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) } template -EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc) { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); - bscale(tRes, result, alpha); + bscale(tRes, result, alpha); - data.template storePacketBlock(i, j, tRes); + data.template storePacketBlock(i, 0, tRes); } -template -EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) { PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); __builtin_mma_disassemble_acc(&resultImag.packet, accImag); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); PacketBlock taccReal, taccImag; bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); PacketBlock acc1, acc2; - bcouple(taccReal, taccImag, tRes, acc1, acc2); + bcouple(taccReal, taccImag, tRes, acc1, acc2); - data.template 
storePacketBlock(i + N*accColsC, j, acc1); - data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); + data.template storePacketBlock(i, 0, acc1); + data.template storePacketBlock(i + accColsC, 0, acc2); } // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments @@ -125,7 +125,7 @@ EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag template EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) { - rhsV = ploadRhs((const Scalar*)(rhs)); + rhsV = ploadRhs(rhs); } template<> @@ -184,12 +184,11 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) } #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0; \ @@ -222,7 +221,7 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) #define MICRO_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -238,21 +237,19 @@ EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) #define MICRO_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeAccumulator(row + iter*accCols, col, res, 
pAlpha, &accZero##iter); \ + storeAccumulator(row + iter*accCols, res, pAlpha, &accZero##iter); \ } #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { const Scalar* rhs_ptr = rhs_base; @@ -278,94 +275,98 @@ EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( row += unroll_factor*accCols; } -template -void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +template +EIGEN_ALWAYS_INLINE void gemmMMA_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { - const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; - - if( strideA == -1 ) strideA = depth; - if( strideB == -1 ) strideB = depth; - - const Packet pAlpha = pset1(alpha); - const Packet pMask = bmask((const int)(remaining_rows)); + const DataMapper res3 = res.getSubMapper(0, col); - Index col = 0; - for(; col + accRows <= cols; col += accRows) - { - const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - Index row = 0; #define MAX_MMA_UNROLL 7 - while(row + MAX_MMA_UNROLL*accCols <= rows) { - gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { + while(row + 
MAX_MMA_UNROLL*accCols <= rows) { + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + } + switch( (rows-row)/accCols ) { #if MAX_MMA_UNROLL > 7 - case 7: - gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 7: + gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 6 - case 6: - gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 6: + gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 5 - case 5: - gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 5: + gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 4 - case 4: - gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 4: + gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 3 - case 3: - gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 3: + gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, 
Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 2 - case 2: - gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 2: + gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 1 - case 1: - gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 1: + gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; - } + default: + break; + } #undef MAX_MMA_UNROLL - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } +} - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; +template +void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; - for(; col < cols; col++) - { - Index row = 0; + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); - if (remaining_rows 
> 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; - } + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -373,21 +374,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define advanceCols ((RhsIsReal) ? 1 : 2) // PEEL_COMPLEX_MMA loop factor. -#define PEEL_COMPLEX_MMA 7 +#define PEEL_COMPLEX_MMA 3 #define MICRO_COMPLEX_MMA_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -400,8 +400,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ if (PEEL_COMPLEX_MMA > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ if(!RhsIsReal) { \ ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ @@ -409,20 +409,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, 
EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ MICRO_COMPLEX_MMA_UNROLL(func2); \ - func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ + type rhsV0, rhsV1, rhsV2, rhsV3; \ + type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0, rhsVi0; \ @@ -459,15 +456,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define 
MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) @@ -475,45 +466,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ } #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_imag0 = NULL, * lhs_ptr_real1 = NULL, * lhs_ptr_imag1 = NULL; - const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_imag2 = NULL, * lhs_ptr_real3 = NULL, * lhs_ptr_imag3 = NULL; - const Scalar* lhs_ptr_real4 = NULL, * lhs_ptr_imag4 = NULL; - __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + __vector_quad 
accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_DST_PTR @@ -537,11 +523,70 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( row += unroll_factor*accCols; } +template +EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL 
> 1 + case 1: + gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } +} + template void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -556,64 +601,10 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_MMA_UNROLL 4 - while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { - gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_MMA_UNROLL > 4 - case 4: - gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 3 - case 3: - gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, 
lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 2 - case 2: - gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 1 - case 1: - gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC -- GitLab From df53e28179017785add9b474582f4975b8debe91 Mon Sep 17 00:00:00 2001 From: Lennart Steffen Date: Fri, 22 Oct 2021 
09:46:43 +0000 Subject: [PATCH 132/266] Included note on inner stride for compile-time vectors. See https://gitlab.com/libeigen/eigen/-/issues/2355#note_711078126 (cherry picked from commit 163f11e24a1011ac8ba1cecfaf53e9b11ace5f5c) --- Eigen/src/Core/Stride.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h index 6494d5142..d164e5399 100644 --- a/Eigen/src/Core/Stride.h +++ b/Eigen/src/Core/Stride.h @@ -38,10 +38,14 @@ namespace Eigen { * \include Map_general_stride.cpp * Output: \verbinclude Map_general_stride.out * - * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime + * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were * not allowed). * + * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompileTime==1), + * the inner stride is the pointer increment between two consecutive elements, + * regardless of storage layout.
+ * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template -- GitLab From 6f57470bcca7520117b8a85eec5206f4c9ed2450 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Thu, 4 Nov 2021 16:10:57 +0800 Subject: [PATCH 133/266] Bug Fix: correct the bug that won't define EIGEN_HAS_FP16_C if the compiler isn't clang (cherry picked from commit 5c642950a5d0232ebdfa6b460b4515e6e2b8828d) --- Eigen/src/Core/util/ConfigureVectorization.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index af4e69623..73e8a65a5 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -438,11 +438,11 @@ #include #endif -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380)) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C - #if defined(EIGEN_COMP_CLANG) + #if EIGEN_COMP_CLANG // Workaround for clang: The FP16C intrinsics for clang are included by // immintrin.h, as opposed to emmintrin.h as suggested by Intel: // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 -- GitLab From c379a21191d3b636d50fad3a54f9a31f604aa751 Mon Sep 17 00:00:00 2001 From: Minh Quan HO Date: Mon, 4 Oct 2021 18:30:26 +0200 Subject: [PATCH 134/266] nestbyvalue test: fix uninitialized matrix - Doing computation with uninitialized (zero-ed ? but thanks Linux) matrix, or worse NaN on other non-linux systems. - This commit fixes it by initializing to Random(). 
(cherry picked from commit 4284c68fbb81cb069a630ae1bf4a953ee922f6e5) --- test/nestbyvalue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp index c5356bc24..3a86bea50 100644 --- a/test/nestbyvalue.cpp +++ b/test/nestbyvalue.cpp @@ -26,7 +26,7 @@ EIGEN_DECLARE_TEST(nestbyvalue) for(int i = 0; i < g_repeat; i++) { Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); - MatrixXd a = MatrixXd(rows,cols); + MatrixXd a = MatrixXd::Random(rows,cols); nb_temporaries = 0; XprType x = get_xpr_with_temps(a); VERIFY_IS_EQUAL(nb_temporaries,6); -- GitLab From 7e3bc4177e6ab4c6d4e16dfabca5c6f9290d8441 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 15 Nov 2021 22:19:04 -0800 Subject: [PATCH 135/266] Fix tensor broadcast off-by-one error. Caught by JAX unit tests. Triggered if broadcast is smaller than packet size. (cherry picked from commit ffb78e23a1b3bc232a07773144cfa5fa1759852d) --- unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index 8d8ad2658..7449b046b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -461,8 +461,8 @@ struct TensorEvaluator, Device> values[i] = m_impl.coeff(inputIndex); ++outputOffset; } else { - outputOffset = 0; values[i] = m_impl.coeff(++inputIndex); + outputOffset = 1; // Next offset. } } return internal::pload(values); -- GitLab From 3af8c262acdfb50fbfc88e6c640d5d1a582145a9 Mon Sep 17 00:00:00 2001 From: David Tellenbach Date: Sat, 25 Dec 2021 19:51:42 +0000 Subject: [PATCH 136/266] Include immintrin.h if F16C is available and vectorization is disabled If EIGEN_DONT_VECTORIZE is defined, immintrin.h is not included even if F16C is available. 
Trying to use F16C intrinsics thus fails. This fixes issue #2395. (cherry picked from commit c06c3e52a082e403e7a241350fd867e907c833dc) --- Eigen/src/Core/util/ConfigureVectorization.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 73e8a65a5..2d12e1d17 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -442,9 +442,11 @@ // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C - #if EIGEN_COMP_CLANG - // Workaround for clang: The FP16C intrinsics for clang are included by - // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + #if EIGEN_COMP_GNUC + // Make sure immintrin.h is included, even if e.g. vectorization is + // explicitly disabled (see also issue #2395). + // Note that FP16C intrinsics for gcc and clang are included by immintrin.h, + // as opposed to emmintrin.h as suggested by Intel: // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 #include #endif -- GitLab From bd72e4a8c4f2bc41d3b4a87be7d5e850efaa461d Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 5 Jan 2022 00:46:09 +0000 Subject: [PATCH 137/266] ensure that eigen::internal::size is not found by ADL, rename to ssize and... 
(cherry picked from commit 9210e71fb378a0f1542272506dc2759b6c147237) --- Eigen/src/Core/IndexedView.h | 4 ++-- Eigen/src/Core/util/Meta.h | 25 +++++++++++++++++++------ Eigen/src/plugins/IndexedViewMethods.h | 6 +++--- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 08476251d..9d41c391f 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -122,10 +122,10 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } + Index rows() const { return internal::index_list_size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } + Index cols() const { return internal::index_list_size(m_colIndices); } /** \returns the nested expression */ const typename internal::remove_all::type& diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 81ae2a32d..0e21fe37f 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -466,20 +466,33 @@ template struct array_size > { }; #endif + /** \internal - * Analogue of the std::size free function. - * It returns the size of the container or view \a x of type \c T + * Analogue of the std::ssize free function. + * It returns the signed size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * + * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. 
*/ -template -EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } +#if EIGEN_COMP_CXXVER < 20 +template +EIGEN_CONSTEXPR auto index_list_size(const T& x) { + using R = std::common_type_t>; + return static_cast(x.size()); +} -template -EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } +#else +template +EIGEN_CONSTEXPR auto index_list_size(T&& x) { + using std::ssize; + return ssize(std::forward(x)); +} +#endif // EIGEN_COMP_CXXVER /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 5bfb19ac6..15c35b0bf 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -90,8 +90,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); + internal::index_list_size(actualRowIndices), + internal::index_list_size(actualColIndices)); } // The following overload returns a Scalar @@ -168,7 +168,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); + (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); } template -- GitLab From b5d218d8574c665d7d4ac3ac21f7bf15b8dabbfa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Buchwald?= Date: Thu, 13 Jan 2022 01:24:20 +0100 Subject: [PATCH 138/266] fix compilation issue with gcc < 10 and -std=c++2a (cherry picked from commit d1bf05639476adbb222e8d09da2faf6ef1ed66f2) --- Eigen/src/Core/util/Meta.h | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0e21fe37f..b67f50d0e 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -477,7 +477,7 @@ template struct array_size > { * * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. */ -#if EIGEN_COMP_CXXVER < 20 +#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_AT_MOST(9,4) template EIGEN_CONSTEXPR auto index_list_size(const T& x) { using R = std::common_type_t>; -- GitLab From 3e71c621c9669a89e067bef52433deaac06e3b99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 18 Jan 2022 16:08:37 +0000 Subject: [PATCH 139/266] Revert "fix compilation issue with gcc < 10 and -std=c++2a" This reverts commit b5d218d8574c665d7d4ac3ac21f7bf15b8dabbfa --- Eigen/src/Core/util/Meta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index b67f50d0e..0e21fe37f 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -477,7 +477,7 @@ template struct array_size > { * * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. */ -#if EIGEN_COMP_CXXVER < 20 || EIGEN_GNUC_AT_MOST(9,4) +#if EIGEN_COMP_CXXVER < 20 template EIGEN_CONSTEXPR auto index_list_size(const T& x) { using R = std::common_type_t>; -- GitLab From 46126273552afe13692929523d34006f54c19719 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 18 Jan 2022 16:08:59 +0000 Subject: [PATCH 140/266] Revert "ensure that eigen::internal::size is not found by ADL, rename to ssize and..." 
This reverts commit bd72e4a8c4f2bc41d3b4a87be7d5e850efaa461d --- Eigen/src/Core/IndexedView.h | 4 ++-- Eigen/src/Core/util/Meta.h | 25 ++++++------------------- Eigen/src/plugins/IndexedViewMethods.h | 6 +++--- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 9d41c391f..08476251d 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -122,10 +122,10 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::index_list_size(m_rowIndices); } + Index rows() const { return internal::size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::index_list_size(m_colIndices); } + Index cols() const { return internal::size(m_colIndices); } /** \returns the nested expression */ const typename internal::remove_all::type& diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 0e21fe37f..81ae2a32d 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -466,33 +466,20 @@ template struct array_size > { }; #endif - /** \internal - * Analogue of the std::ssize free function. - * It returns the signed size of the container or view \a x of type \c T + * Analogue of the std::size free function. + * It returns the size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * - * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. 
*/ -#if EIGEN_COMP_CXXVER < 20 -template -EIGEN_CONSTEXPR auto index_list_size(const T& x) { - using R = std::common_type_t>; - return static_cast(x.size()); -} +template +EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } -template -EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } -#else -template -EIGEN_CONSTEXPR auto index_list_size(T&& x) { - using std::ssize; - return ssize(std::forward(x)); -} -#endif // EIGEN_COMP_CXXVER +template +EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 15c35b0bf..5bfb19ac6 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -90,8 +90,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::index_list_size(actualRowIndices), - internal::index_list_size(actualColIndices)); + internal::size(actualRowIndices), + internal::size(actualColIndices)); } // The following overload returns a Scalar @@ -168,7 +168,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); + (derived(), internal::first(actualIndices), internal::size(actualIndices)); } template -- GitLab From 34e5f34b391e9b964917d54139aae1ecf522d4e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 21 Mar 2022 15:56:03 +0000 Subject: [PATCH 141/266] Update warning suppression to latest. 
--- Eigen/src/Core/util/DisableStupidWarnings.h | 90 +++++++++++++-------- 1 file changed, 57 insertions(+), 33 deletions(-) diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index e950749e7..0865fb698 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -1,7 +1,7 @@ #ifndef EIGEN_WARNINGS_DISABLED #define EIGEN_WARNINGS_DISABLED -#ifdef _MSC_VER +#if defined(_MSC_VER) // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable // 4127 - conditional expression is constant @@ -36,25 +36,28 @@ #pragma warning disable 2196 279 1684 2259 #elif defined __clang__ - // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant - // this is really a stupid warning as it warns on compile-time expressions involving enums #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma clang diagnostic push #endif - #pragma clang diagnostic ignored "-Wconstant-logical-operand" - #if __clang_major__ >= 3 && __clang_minor__ >= 5 - #pragma clang diagnostic ignored "-Wabsolute-value" - #endif - #if __clang_major__ >= 10 - #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" - #endif - #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L - // warning: generic selections are a C11-specific feature - // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h - #pragma clang diagnostic ignored "-Wc11-extensions" + #if defined(__has_warning) + // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant + // this is really a stupid warning as it warns on compile-time expressions involving enums + #if __has_warning("-Wconstant-logical-operand") + #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #endif + #if 
__has_warning("-Wimplicit-int-float-conversion") + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #if __has_warning("-Wc11-extensions") + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif + #endif #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined(__FUJITSU) #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push @@ -75,32 +78,53 @@ #endif #if defined __NVCC__ - #pragma diag_suppress boolean_controlling_expr_is_constant + // MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so + // we instead use Microsoft's __pragma extension. + #if defined _MSC_VER + #define EIGEN_MAKE_PRAGMA(X) __pragma(#X) + #else + #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X) + #endif + #if defined __NVCC_DIAG_PRAGMA_SUPPORT__ + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X) + #else + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X) + #endif + + EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant) // Disable the "statement is unreachable" message - #pragma diag_suppress code_is_unreachable + EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable) // Disable the "dynamic initialization in unreachable code" message - #pragma diag_suppress initialization_not_reachable + EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable) // Disable the "invalid error number" message that we get with older versions of nvcc - #pragma diag_suppress 1222 + EIGEN_NV_DIAG_SUPPRESS(1222) // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler) - #pragma diag_suppress 2527 - #pragma 
diag_suppress 2529 - #pragma diag_suppress 2651 - #pragma diag_suppress 2653 - #pragma diag_suppress 2668 - #pragma diag_suppress 2669 - #pragma diag_suppress 2670 - #pragma diag_suppress 2671 - #pragma diag_suppress 2735 - #pragma diag_suppress 2737 - #pragma diag_suppress 2739 - #pragma diag_suppress 2976 - #pragma diag_suppress 2979 + EIGEN_NV_DIAG_SUPPRESS(2527) + EIGEN_NV_DIAG_SUPPRESS(2529) + EIGEN_NV_DIAG_SUPPRESS(2651) + EIGEN_NV_DIAG_SUPPRESS(2653) + EIGEN_NV_DIAG_SUPPRESS(2668) + EIGEN_NV_DIAG_SUPPRESS(2669) + EIGEN_NV_DIAG_SUPPRESS(2670) + EIGEN_NV_DIAG_SUPPRESS(2671) + EIGEN_NV_DIAG_SUPPRESS(2735) + EIGEN_NV_DIAG_SUPPRESS(2737) + EIGEN_NV_DIAG_SUPPRESS(2739) + EIGEN_NV_DIAG_SUPPRESS(2885) + EIGEN_NV_DIAG_SUPPRESS(2888) + EIGEN_NV_DIAG_SUPPRESS(2976) + EIGEN_NV_DIAG_SUPPRESS(2979) + EIGEN_NV_DIAG_SUPPRESS(20011) + EIGEN_NV_DIAG_SUPPRESS(20014) // Disable the "// __device__ annotation is ignored on a function(...) that is // explicitly defaulted on its first declaration" message. // The __device__ annotation seems to actually be needed in some cases, // otherwise resulting in kernel runtime errors. 
- #pragma diag_suppress 2977 + EIGEN_NV_DIAG_SUPPRESS(2886) + EIGEN_NV_DIAG_SUPPRESS(2977) + EIGEN_NV_DIAG_SUPPRESS(20012) + #undef EIGEN_NV_DIAG_SUPPRESS + #undef EIGEN_MAKE_PRAGMA #endif #else -- GitLab From f3aaba870548da6be9e08ce7625d1be69715d989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Sun, 10 Apr 2022 15:30:33 +0000 Subject: [PATCH 142/266] Revert "Replace call to FixedDimensions() with a singleton instance of" This reverts commit 19e6496ce0c52fef33265bca54285ba77b2155be (cherry picked from commit f7b31f864c0dec7872038cab79f6e677de2ecc71) --- unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h | 2 +- unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h | 9 +++------ 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index ca39bb855..c7c1cfc72 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -61,7 +61,7 @@ class TensorFixedSize : public TensorBase class TensorStorage, Options_> -- GitLab From a1e1612c287dd68dc8c836e410499711ea92d822 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 15 Apr 2022 21:45:53 -0700 Subject: [PATCH 143/266] Fix cwise NaN propagation for scalar input. Was missing a template parameter. Updated tests. Fixes #2474. 
--- test/array_for_matrix.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index 8086b3432..06e04a2fa 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -219,11 +219,20 @@ template void cwise_min_max(const MatrixType& m) VERIFY((numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); VERIFY(!(numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); VERIFY(!(numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); VERIFY((numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); VERIFY(!(numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); VERIFY(!(numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(Scalar(1))(0,0))); // Reductions. VERIFY((numext::isnan)(m1.template maxCoeff())); -- GitLab From e7248b26a1ed53fa030c5c459f7ea095dfd276ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 19 May 2022 22:29:48 +0000 Subject: [PATCH 144/266] Prevent BDCSVD crash caused by index out of bounds. 
(cherry picked from commit 028ab12586ee1244755455107fcba66493b336d6) --- Eigen/src/SVD/BDCSVD.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index a76a8dd04..6dcb986c1 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -978,8 +978,8 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - -// if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); -// if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); + // if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); + // if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); } } @@ -1033,7 +1033,14 @@ void BDCSVD::perturbCol0 std::cout << " " << "j=" << j << "\n"; } #endif - Index j = i 0 ? 
perm(l-1) : i; #ifdef EIGEN_BDCSVD_SANITY_CHECKS if(!(dk!=Literal(0) || diag(i)!=Literal(0))) { -- GitLab From 770ed0794eac2928bbbf9c21b12da89cd065b05d Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sun, 19 Dec 2021 17:21:52 +0200 Subject: [PATCH 145/266] fix broken asserts (cherry picked from commit 5a0a165c095aa12e5a4f188c56543f33ea546239) --- Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 7803fd817..5e632c4e2 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -160,13 +160,13 @@ class IncompleteCholesky : public SparseSolverBase Date: Wed, 19 Jan 2022 21:34:10 +0000 Subject: [PATCH 146/266] Silence some MSVC warnings (cherry picked from commit 81c928ba55af78333d7446a55e3a0a36a4992c0e) --- Eigen/src/Core/util/Memory.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 875318cdb..79a763ccd 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -294,7 +294,7 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T */ template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) { - std::size_t i; + std::size_t i=0; EIGEN_TRY { for (i = 0; i < size; ++i) ::new (ptr + i) T; @@ -305,7 +305,6 @@ template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T * destruct_elements_of_array(ptr, i); EIGEN_THROW; } - return NULL; } /***************************************************************************** -- GitLab From 52e545324e9dc1db6d694ee4602b331cfd4efb26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 4 Feb 2022 19:01:07 +0000 Subject: [PATCH 147/266] Fix ODR violations. 
(cherry picked from commit cafeadffef2a7ba41f2da5cf34c38068d74499eb) --- Eigen/src/Core/arch/GPU/PacketMath.h | 3 --- unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h | 9 ++++----- unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h | 6 +----- unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h | 7 ++++--- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 5 ----- 5 files changed, 9 insertions(+), 21 deletions(-) diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 25c45fd35..bfc11efbc 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -121,7 +121,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) { @@ -180,8 +179,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, return __longlong_as_double(a < b ? 
0xffffffffffffffffull : 0ull); } -} // namespace - template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, const float4& b) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index 3aff7fa01..6ac575ef5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -192,7 +192,7 @@ struct TensorEvaluator const Device EIGEN_DEVICE_REF m_device; }; -namespace { +namespace internal { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) { return *address; @@ -219,8 +219,7 @@ T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address return *address; } #endif -} - +} // namespace internal // Default evaluator for rvalues template @@ -289,7 +288,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); - return loadConstant(m_data+index); + return internal::loadConstant(m_data+index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -314,7 +313,7 @@ struct TensorEvaluator eigen_assert(m_data != NULL); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? 
m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); + return internal::loadConstant(m_data+index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 6d5cce4aa..74fdc4c3c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -28,8 +28,6 @@ namespace Eigen { namespace internal { -namespace { - // Note: result is undefined if val == 0 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE @@ -135,8 +133,6 @@ namespace { #endif } }; -} - template struct TensorIntDivisor { @@ -252,7 +248,7 @@ private: template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index b3f00f77a..57da2e18d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -369,8 +369,9 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { @@ -400,7 +401,7 @@ template struct MemcpyTriggerForSlicing @@ -511,7 +512,7 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
- const MemcpyTriggerForSlicing trigger(m_device); + const internal::MemcpyTriggerForSlicing trigger(m_device); if (trigger(internal::array_prod(dimensions()), contiguous_values)) { EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 37c1d1c3d..e1d55ceaa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -14,8 +14,6 @@ namespace Eigen { namespace internal { -namespace { - EIGEN_DEVICE_FUNC uint64_t get_random_seed() { #if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and @@ -43,9 +41,6 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; } -} // namespace - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) { unsigned rnd = PCG_XSH_RS_generator(state, stream); -- GitLab From 77b28073227df93b6e41d3f155170c8b5716efe3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 4 Feb 2022 19:35:18 +0000 Subject: [PATCH 148/266] Fix AVX512 math function consistency, enable for ICC. 
(cherry picked from commit 96da541cba007a84979ee5e3000c13eab982d56c) --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 7 +++++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 24 +++++++++++----------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 6fd726d29..9522024a6 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -15,7 +15,8 @@ namespace Eigen { namespace internal { // Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) @@ -326,7 +327,9 @@ Packet16f pexpm1(const Packet16f& _x) { F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) -#endif +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif // EIGEN_HAS_AVX512_MATH template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 34d49ab66..8fb8e02b8 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -72,12 +72,14 @@ struct packet_traits : default_packet_traits { HasMax = 1, HasConj = 1, HasSetLinear = 0, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasLog = EIGEN_HAS_AVX512_MATH, + HasLog1p = EIGEN_HAS_AVX512_MATH, + HasExp = EIGEN_HAS_AVX512_MATH, + HasExpm1 = EIGEN_HAS_AVX512_MATH, + HasSqrt = EIGEN_HAS_AVX512_MATH, + HasRsqrt = EIGEN_HAS_AVX512_MATH, + HasBessel = EIGEN_HAS_AVX512_MATH, + HasNdtri = EIGEN_HAS_AVX512_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, @@ -86,9 
+88,7 @@ struct packet_traits : default_packet_traits { HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, - HasBessel = 1, - HasNdtri = 1 + HasRint = 1 }; }; @@ -109,7 +109,7 @@ template<> struct packet_traits : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -138,7 +138,7 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasExp = 1, HasSqrt = EIGEN_FAST_MATH, @@ -1852,7 +1852,7 @@ struct packet_traits : default_packet_traits { HasInsert = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, // Currently fails test with bad accuracy. HasLog1p = 1, -- GitLab From 730a7812213cefdebf2f6b95716f69817842693e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 4 Feb 2022 22:25:52 +0000 Subject: [PATCH 149/266] Define EIGEN_HAS_AVX512_MATH in PacketMath. (cherry picked from commit e7f4a901ee8cbe42d37bcabefb342086235c3839) --- Eigen/src/Core/arch/AVX512/MathFunctions.h | 6 +----- Eigen/src/Core/arch/AVX512/PacketMath.h | 7 +++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 9522024a6..017d6bff0 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -14,9 +14,7 @@ namespace Eigen { namespace internal { -// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. 
-#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 -#define EIGEN_HAS_AVX512_MATH 1 +#if EIGEN_HAS_AVX512_MATH #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) @@ -327,8 +325,6 @@ Packet16f pexpm1(const Packet16f& _x) { F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) -#else -#define EIGEN_HAS_AVX512_MATH 0 #endif // EIGEN_HAS_AVX512_MATH diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 8fb8e02b8..96d85ff20 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -28,6 +28,13 @@ namespace internal { #endif #endif +// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics. +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif + typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; -- GitLab From 995714142d37e380de52b062544ccd6982b4cc52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 4 Feb 2022 22:47:34 +0000 Subject: [PATCH 150/266] Restrict GCC<6.3 maxpd workaround to only gcc. 
(cherry picked from commit 4bffbe84f9125fc05bc781bf2ec87ada73ecf7f2) --- Eigen/src/Core/arch/SSE/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index db102c73a..9feca1ccc 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -513,7 +513,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, -- GitLab From 709d7048198fb622a576200c9c18e6b8205347fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 7 Feb 2022 18:17:42 +0000 Subject: [PATCH 151/266] Fix collision with resolve.h. 
(cherry picked from commit 94bed2b80c8ed7a4293a0d16afd11e617e447eeb) --- .../Core/products/TriangularMatrixMatrix.h | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index f0c60507a..ba605a1c2 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -18,10 +18,10 @@ namespace internal { // struct gemm_pack_lhs_triangular // { // Matrix::IsComplex && Conjugate> cj; -// const_blas_data_mapper lhs(_lhs,lhsStride); +// const_blas_data_mapper lhs(lhs_,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i& blocking); }; @@ -110,9 +110,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -124,9 +124,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -254,8 +254,8 @@ struct product_triangular_matrix_matrix& blocking); }; @@ -268,9 +268,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index 
rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -283,9 +283,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction -- GitLab From 21e0ad056e30591bee723846e95399a74182dac3 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 11 Feb 2022 23:19:56 -0800 Subject: [PATCH 152/266] Fix ODR failures in TensorRandom. (cherry picked from commit bded5028a5bd112181b94b2a246ac2c20e671c2f) --- unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index e1d55ceaa..2bcb39a95 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -14,7 +14,7 @@ namespace Eigen { namespace internal { -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() { #if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. 
@@ -27,7 +27,7 @@ EIGEN_DEVICE_FUNC uint64_t get_random_seed() { #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state @@ -36,7 +36,7 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint6 return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { seed = seed ? seed : get_random_seed(); return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; } -- GitLab From d1ed3fe5c9f70fd1ca2d024362af7ba23529cb16 Mon Sep 17 00:00:00 2001 From: Martin Heistermann Date: Fri, 18 Feb 2022 16:13:28 +0000 Subject: [PATCH 153/266] Fix for crash bug in SPQRSupport: Initialize pointers to nullptr to avoid free() calls of invalid pointers. (cherry picked from commit 550af3938cc46116435f3dd88a9cdc94837d7542) --- Eigen/src/SPQRSupport/SuiteSparseQRSupport.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h index 013c7ae7a..3ceac5ad3 100644 --- a/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h +++ b/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h @@ -258,12 +258,12 @@ class SPQR : public SparseSolverBase > int m_ordering; // Ordering method to use, see SPQR's manual int m_allow_tol; // Allow to use some tolerance during numerical factorization. 
RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero - mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format + mutable cholmod_sparse *m_cR = nullptr; // The sparse R factor in cholmod format mutable MatrixType m_R; // The sparse matrix R in Eigen format - mutable StorageIndex *m_E; // The permutation applied to columns - mutable cholmod_sparse *m_H; //The householder vectors - mutable StorageIndex *m_HPinv; // The row permutation of H - mutable cholmod_dense *m_HTau; // The Householder coefficients + mutable StorageIndex *m_E = nullptr; // The permutation applied to columns + mutable cholmod_sparse *m_H = nullptr; //The householder vectors + mutable StorageIndex *m_HPinv = nullptr; // The row permutation of H + mutable cholmod_dense *m_HTau = nullptr; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters bool m_useDefaultThreshold; // Use default threshold -- GitLab From 36be6747e0576afdd09553a4b342d89538c8f179 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 23 Feb 2022 16:37:03 +0000 Subject: [PATCH 154/266] Modify test expression to avoid numerical differences (#2402). 
(cherry picked from commit ae86a146b1ac9a49bf72e485254c08d237fd094a) --- unsupported/test/cxx11_tensor_block_eval.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index b2e26ebb7..d66a63e0f 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -244,7 +244,7 @@ static void test_eval_tensor_binary_with_unary_expr_block() { rhs.setRandom(); VerifyBlockEvaluator( - (lhs.square() + rhs.square()).sqrt(), + (lhs.abs() + rhs.abs()).sqrt(), [&dims]() { return RandomBlock(dims, 1, 10); }); } -- GitLab From cd543434bf879bea93ea398d36160ed9a075375d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 23 Feb 2022 21:56:25 +0000 Subject: [PATCH 155/266] Fix gcc-5 packetmath_12 bug. (cherry picked from commit 897071977177ef0fd392861655ba1ad47546a265) --- test/packetmath.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 121ec7283..f29417b98 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -353,10 +353,10 @@ template void packetmath_minus_zero_add() { const int PacketSize = internal::unpacket_traits::size; const int size = 2 * PacketSize; - EIGEN_ALIGN_MAX Scalar data1[size]; - EIGEN_ALIGN_MAX Scalar data2[size]; - EIGEN_ALIGN_MAX Scalar ref[size]; - + EIGEN_ALIGN_MAX Scalar data1[size] = {}; + EIGEN_ALIGN_MAX Scalar data2[size] = {}; + EIGEN_ALIGN_MAX Scalar ref[size] = {}; + for (int i = 0; i < PacketSize; ++i) { data1[i] = Scalar(-0.0); data1[i + PacketSize] = Scalar(-0.0); -- GitLab From d259104c8de1e43619327f5d7f6a0dbb372f3f66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 24 Feb 2022 22:16:37 +0000 Subject: [PATCH 156/266] Fix frexp packetmath tests for MSVC. 
(cherry picked from commit 2ed4bee78fc44b37d5e5b9f532f5659f9c978499) --- test/packetmath.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/packetmath.cpp b/test/packetmath.cpp index f29417b98..23aa33fc2 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -52,6 +52,12 @@ inline T REF_FREXP(const T& x, T& exp) { EIGEN_USING_STD(frexp) const T out = static_cast(frexp(x, &iexp)); exp = static_cast(iexp); + + // The exponent value is unspecified if the input is inf or NaN, but MSVC + // seems to set it to 1. We need to set it back to zero for consistency. + if (!(numext::isfinite)(x)) { + exp = T(0); + } return out; } -- GitLab From e0fe00691518cf39a5f8fe561a02d7832c90127a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 25 Feb 2022 19:28:10 +0000 Subject: [PATCH 157/266] Fix mixingtypes for g++-11. (cherry picked from commit 19c39bea29e21041ceca481851b3a5c889b51d98) --- Eigen/src/Core/arch/AVX512/Complex.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 074253859..bdedb7b6c 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -255,11 +255,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { - #ifdef EIGEN_VECTORIZE_AVX512DQ - return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); - #else return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); - #endif } template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { -- GitLab From bd0d873b16854a805b4fc4f041ac8e0b018befc8 Mon Sep 17 00:00:00 2001 From: Yury Gitman Date: Tue, 1 Mar 2022 05:27:50 +0000 Subject: [PATCH 158/266] Fix any/all reduction in the case of row-major layout (cherry picked from commit bf6726a0c6cb5b9400c54568308cdc094a53bfc0) --- Eigen/src/Core/BooleanRedux.h | 62 
+++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index 852de8b90..15e9f3f8d 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -14,54 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (Derived::Flags & RowMajor), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i); } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (Derived::Flags & RowMajor), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; - + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? 
j : i); } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; @@ -81,16 +83,18 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, + IsRowMajor = (internal::traits::Flags & RowMajor), + InnerSizeAtCompileTime = IsRowMajor ? internal::traits::ColsAtCompileTime : internal::traits::RowsAtCompileTime }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::RowsAtCompileTime>::run(evaluator); + return internal::all_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (!evaluator.coeff(i, j)) return false; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false; return true; } } @@ -105,16 +109,18 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, + IsRowMajor = (internal::traits::Flags & RowMajor), + InnerSizeAtCompileTime = IsRowMajor ? 
internal::traits::ColsAtCompileTime : internal::traits::RowsAtCompileTime }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::RowsAtCompileTime>::run(evaluator); + return internal::any_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (evaluator.coeff(i, j)) return true; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true; return false; } } @@ -156,7 +162,7 @@ inline bool DenseBase::allFinite() const return !((derived()-derived()).hasNaN()); #endif } - + } // end namespace Eigen #endif // EIGEN_ALLANDANY_H -- GitLab From bc1b354b32feac4a6f9cc0a1bf5b4baa6e8d6e79 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 1 Mar 2022 23:31:15 +0000 Subject: [PATCH 159/266] Adjust tolerance of matrix_power test for MSVC. (cherry picked from commit 1c2690ed248327539f7a248ddb12e1690da81b68) --- unsupported/test/matrix_power.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index dbaf9dbdf..ab1a030b8 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -171,7 +171,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3f)); // see bug 614 - CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-12L)); CALL_SUBTEST_10(testGeneral(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13L)); @@ -184,7 +184,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3f)); - 
CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-12L)); CALL_SUBTEST_10(testSingular(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13L)); @@ -197,7 +197,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3f)); - CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-12L)); CALL_SUBTEST_10(testLogThenExp(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13L)); -- GitLab From b30a2a527ef2c10f9650d3e4801027c76d9e9351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 2 Mar 2022 19:31:20 +0000 Subject: [PATCH 160/266] Remove poor non-convergence checks in NonLinearOptimization. (cherry picked from commit d819a33bf64c4fce95c55f8e44a68b486f064a79) --- unsupported/test/NonLinearOptimization.cpp | 136 +++++++----- unsupported/test/levenberg_marquardt.cpp | 239 ++++++++++++--------- 2 files changed, 220 insertions(+), 155 deletions(-) diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index c667b7247..b6c29ca4d 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -12,14 +12,10 @@ // It is intended to be done for this test only. 
#include -// tolerance for chekcing number of iterations -#define LM_EVAL_COUNT_TOL 4/3 +// tolerance for checking number of iterations +#define LM_EVAL_COUNT_TOL 2 #define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \ - ++g_test_level; \ - VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \ - VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \ - --g_test_level; \ VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \ VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \ } @@ -186,9 +182,10 @@ void testLmder1() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.lmder1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -214,9 +211,10 @@ void testLmder() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -298,9 +296,10 @@ void testHybrj1() hybrj_functor functor; HybridNonLinearSolver solver(functor); info = solver.hybrj1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(solver, 11, 1); // check norm @@ -332,9 +331,10 @@ void testHybrj() solver.diag.setConstant(n, 1.); solver.useExternalScaling = true; info = solver.solve(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(solver, 11, 1); // check norm @@ -385,10 +385,11 @@ void testHybrd1() hybrd_functor functor; HybridNonLinearSolver solver(functor); info = solver.hybrd1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(solver.nfev, 20); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(solver.nfev <= 20*LM_EVAL_COUNT_TOL); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -416,10 +417,11 @@ void testHybrd() 
solver.diag.setConstant(n, 1.); solver.useExternalScaling = true; info = solver.solveNumericalDiff(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(solver.nfev, 14); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(solver.nfev <= 14*LM_EVAL_COUNT_TOL); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -487,9 +489,10 @@ void testLmstr1() lmstr_functor functor; LevenbergMarquardt lm(functor); info = lm.lmstr1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -515,9 +518,10 @@ void testLmstr() lmstr_functor functor; LevenbergMarquardt lm(functor); info = lm.minimizeOptimumStorage(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -570,10 +574,11 @@ void testLmdif1() lmdif_functor functor; DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning info = LevenbergMarquardt::lmdif1(functor, x, &nfev); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(nfev, 26); + // VERIFY_IS_EQUAL(info, 1); + VERIFY( nfev <= 26*LM_EVAL_COUNT_TOL); // check norm functor(x, fvec); @@ -601,10 +606,11 @@ void testLmdif() NumericalDiff numDiff(functor); LevenbergMarquardt > lm(numDiff); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev, 26); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(lm.nfev <= 26*LM_EVAL_COUNT_TOL); // check norm fnorm = lm.fvec.blueNorm(); @@ -686,9 +692,10 @@ void testNistChwirut2(void) chwirut2_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 10, 8); // check norm^2 
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); @@ -706,9 +713,10 @@ void testNistChwirut2(void) lm.parameters.ftol = 1.E6*NumTraits::epsilon(); lm.parameters.xtol = 1.E6*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 7, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); @@ -764,9 +772,10 @@ void testNistMisra1a(void) misra1a_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 19, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); @@ -780,9 +789,10 @@ void testNistMisra1a(void) x<< 250., 0.0005; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 5, 4); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); @@ -852,9 +862,10 @@ void testNistHahn1(void) hahn1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 11, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); @@ -873,9 +884,10 @@ void testNistHahn1(void) x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 11, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); @@ -936,9 +948,10 @@ void testNistMisra1d(void) misra1d_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - 
VERIFY_IS_EQUAL(info, 3); + // VERIFY_IS_EQUAL(info, 3); LM_CHECK_N_ITERS(lm, 9, 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); @@ -952,9 +965,10 @@ void testNistMisra1d(void) x<< 450., 0.0003; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 4, 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); @@ -1012,13 +1026,14 @@ void testNistLanczos1(void) lanczos1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 79, 72); // check norm^2 - std::cout.precision(30); - std::cout << lm.fvec.squaredNorm() << "\n"; + // std::cout.precision(30); + // std::cout << lm.fvec.squaredNorm() << "\n"; VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); @@ -1034,9 +1049,10 @@ void testNistLanczos1(void) x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 9, 8); // check norm^2 VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); @@ -1098,9 +1114,10 @@ void testNistRat42(void) rat42_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 10, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); @@ -1115,9 +1132,10 @@ void testNistRat42(void) x<< 75., 2.5, 0.07; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm^2 
VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); @@ -1174,9 +1192,10 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 284, 249); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); @@ -1191,9 +1210,10 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 3); + // VERIFY_IS_EQUAL(info, 3); LM_CHECK_N_ITERS(lm, 126, 116); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); @@ -1251,9 +1271,10 @@ void testNistBoxBOD(void) lm.parameters.xtol = 1.E6*NumTraits::epsilon(); lm.parameters.factor = 10.; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 31, 25); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); @@ -1270,10 +1291,11 @@ void testNistBoxBOD(void) lm.parameters.ftol = NumTraits::epsilon(); lm.parameters.xtol = NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - LM_CHECK_N_ITERS(lm, 15, 14); + // VERIFY_IS_EQUAL(info, 1); + LM_CHECK_N_ITERS(lm, 20, 14); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); // check x @@ -1331,6 +1353,7 @@ void testNistMGH17(void) lm.parameters.xtol = NumTraits::epsilon(); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); @@ -1342,7 +1365,7 @@ void testNistMGH17(void) VERIFY_IS_APPROX(x[4], 2.2122699662E-02); // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 602, 545); /* @@ -1352,9 +1375,10 @@ 
void testNistMGH17(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); @@ -1417,9 +1441,10 @@ void testNistMGH09(void) LevenbergMarquardt lm(functor); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 490, 376); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); @@ -1436,9 +1461,10 @@ void testNistMGH09(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); @@ -1501,9 +1527,10 @@ void testNistBennett5(void) LevenbergMarquardt lm(functor); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 758, 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); @@ -1518,9 +1545,10 @@ void testNistBennett5(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 203, 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); @@ -1587,9 +1615,10 @@ void testNistThurber(void) lm.parameters.ftol = 1.E4*NumTraits::epsilon(); lm.parameters.xtol = 1.E4*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 39,36); // check 
norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); @@ -1611,9 +1640,10 @@ void testNistThurber(void) lm.parameters.ftol = 1.E4*NumTraits::epsilon(); lm.parameters.xtol = 1.E4*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 29, 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); @@ -1677,9 +1707,10 @@ void testNistRat43(void) lm.parameters.ftol = 1.E6*NumTraits::epsilon(); lm.parameters.xtol = 1.E6*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 27, 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); @@ -1698,9 +1729,10 @@ void testNistRat43(void) lm.parameters.ftol = 1.E5*NumTraits::epsilon(); lm.parameters.xtol = 1.E5*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 9, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); @@ -1760,9 +1792,10 @@ void testNistEckerle4(void) eckerle4_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); @@ -1777,9 +1810,10 @@ void testNistEckerle4(void) x<< 1.5, 5., 450.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 7, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); diff --git a/unsupported/test/levenberg_marquardt.cpp b/unsupported/test/levenberg_marquardt.cpp index 
7f9a81cd3..d0748d13a 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -24,7 +24,7 @@ using std::sqrt; // tolerance for chekcing number of iterations -#define LM_EVAL_COUNT_TOL 4/3 +#define LM_EVAL_COUNT_TOL 2 struct lmder_functor : DenseFunctor { @@ -75,11 +75,11 @@ void testLmder1() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.lmder1(x); - + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm VERIFY_IS_APPROX(lm.fvec().blueNorm(), 0.09063596); @@ -104,11 +104,12 @@ void testLmder() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm fnorm = lm.fvec().blueNorm(); @@ -177,9 +178,10 @@ void testLmdif1() lmdif_functor functor; DenseIndex nfev; info = LevenbergMarquardt::lmdif1(functor, x, &nfev); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(nfev, 26); // check norm @@ -208,9 +210,10 @@ void testLmdif() NumericalDiff numDiff(functor); LevenbergMarquardt > lm(numDiff); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 26); // check norm @@ -293,11 +296,12 @@ void testNistChwirut2(void) chwirut2_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 10); - 
VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02); // check x @@ -314,11 +318,12 @@ void testNistChwirut2(void) lm.setFtol(1.E6*NumTraits::epsilon()); lm.setXtol(1.E6*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 7); - VERIFY_IS_EQUAL(lm.njev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02); // check x @@ -373,11 +378,12 @@ void testNistMisra1a(void) misra1a_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 19); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 19); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01); // check x @@ -390,11 +396,12 @@ void testNistMisra1a(void) x<< 250., 0.0005; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 5); - VERIFY_IS_EQUAL(lm.njev(), 4); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 5); + // VERIFY_IS_EQUAL(lm.njev(), 4); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01); // check x @@ -464,11 +471,12 @@ void testNistHahn1(void) hahn1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 11); - VERIFY_IS_EQUAL(lm.njev(), 10); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 11); + // VERIFY_IS_EQUAL(lm.njev(), 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00); // check x @@ 
-486,11 +494,12 @@ void testNistHahn1(void) x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 11); - VERIFY_IS_EQUAL(lm.njev(), 10); + // VERIFY_IS_EQUAL(lm.njev(), 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00); // check x @@ -550,11 +559,12 @@ void testNistMisra1d(void) misra1d_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 7); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02); // check x @@ -567,11 +577,12 @@ void testNistMisra1d(void) x<< 450., 0.0003; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 4); - VERIFY_IS_EQUAL(lm.njev(), 3); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 4); + // VERIFY_IS_EQUAL(lm.njev(), 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02); // check x @@ -628,11 +639,12 @@ void testNistLanczos1(void) lanczos1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 79); - VERIFY_IS_EQUAL(lm.njev(), 72); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 79); + // VERIFY_IS_EQUAL(lm.njev(), 72); // check norm^2 VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x @@ -649,11 +661,12 @@ void testNistLanczos1(void) x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3; 
// do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x @@ -714,11 +727,12 @@ void testNistRat42(void) rat42_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 10); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 10); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00); // check x @@ -732,11 +746,12 @@ void testNistRat42(void) x<< 75., 2.5, 0.07; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00); // check x @@ -787,14 +802,15 @@ void testNistMGH10(void) /* * First try */ - x<< 2., 400000., 25000.; + x << 2., 400000., 25000.; // do the computation MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); - ++g_test_level; - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - --g_test_level; + EIGEN_UNUSED_VARIABLE(info) + // ++g_test_level; + // VERIFY_IS_EQUAL(info, 
LevenbergMarquardtSpace::RelativeReductionTooSmall); + // --g_test_level; // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 @@ -805,11 +821,11 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 284 ); - VERIFY_IS_EQUAL(lm.njev(), 249 ); - --g_test_level; + + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 284 ); + // VERIFY_IS_EQUAL(lm.njev(), 249 ); + // --g_test_level; VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL); @@ -819,11 +835,12 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); - ++g_test_level; - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - // was: VERIFY_IS_EQUAL(info, 1); - --g_test_level; - + EIGEN_UNUSED_VARIABLE(info) + // ++g_test_level; + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // // was: VERIFY_IS_EQUAL(info, 1); + // --g_test_level; + // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); // check x @@ -832,10 +849,10 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 126); - VERIFY_IS_EQUAL(lm.njev(), 116); - --g_test_level; + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 126); + // VERIFY_IS_EQUAL(lm.njev(), 116); + // --g_test_level; VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL); } @@ -888,6 +905,7 @@ void testNistBoxBOD(void) lm.setXtol(1.E6*NumTraits::epsilon()); lm.setFactor(10); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03); @@ -896,9 +914,9 @@ void testNistBoxBOD(void) VERIFY_IS_APPROX(x[1], 5.4723748542E-01); // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY(lm.nfev() < 31); // 31 - VERIFY(lm.njev() < 25); // 25 + // VERIFY_IS_EQUAL(info, 1); + 
// VERIFY(lm.nfev() < 31); // 31 + // VERIFY(lm.njev() < 25); // 25 /* * Second try @@ -909,13 +927,14 @@ void testNistBoxBOD(void) lm.setFtol(NumTraits::epsilon()); lm.setXtol( NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 16 ); - VERIFY_IS_EQUAL(lm.njev(), 15 ); - --g_test_level; + // VERIFY_IS_EQUAL(info, 1); + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 16 ); + // VERIFY_IS_EQUAL(lm.njev(), 15 ); + // --g_test_level; VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL); // check norm^2 @@ -975,6 +994,7 @@ void testNistMGH17(void) lm.setXtol(NumTraits::epsilon()); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05); @@ -987,8 +1007,8 @@ void testNistMGH17(void) // check return value // VERIFY_IS_EQUAL(info, 2); //FIXME Use (lm.info() == Success) - VERIFY(lm.nfev() < 700 ); // 602 - VERIFY(lm.njev() < 600 ); // 545 + // VERIFY(lm.nfev() < 700 ); // 602 + // VERIFY(lm.njev() < 600 ); // 545 /* * Second try @@ -997,11 +1017,12 @@ void testNistMGH17(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05); // check x @@ -1063,6 +1084,7 @@ void testNistMGH09(void) LevenbergMarquardt lm(functor); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04); @@ -1072,9 +1094,9 @@ void testNistMGH09(void) VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01 
VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01 // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY(lm.nfev() < 510 ); // 490 - VERIFY(lm.njev() < 400 ); // 376 + // VERIFY_IS_EQUAL(info, 1); + // VERIFY(lm.nfev() < 510 ); // 490 + // VERIFY(lm.njev() < 400 ); // 376 /* * Second try @@ -1083,11 +1105,12 @@ void testNistMGH09(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 16); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04); // check x @@ -1149,11 +1172,12 @@ void testNistBennett5(void) LevenbergMarquardt lm(functor); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 758); - VERIFY_IS_EQUAL(lm.njev(), 744); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 758); + // VERIFY_IS_EQUAL(lm.njev(), 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04); // check x @@ -1167,11 +1191,12 @@ void testNistBennett5(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 203); - VERIFY_IS_EQUAL(lm.njev(), 192); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 203); + // VERIFY_IS_EQUAL(lm.njev(), 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04); // check x @@ -1237,11 +1262,12 @@ void testNistThurber(void) lm.setFtol(1.E4*NumTraits::epsilon()); lm.setXtol(1.E4*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 39); - VERIFY_IS_EQUAL(lm.njev(), 
36); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 39); + // VERIFY_IS_EQUAL(lm.njev(), 36); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03); // check x @@ -1262,11 +1288,12 @@ void testNistThurber(void) lm.setFtol(1.E4*NumTraits::epsilon()); lm.setXtol(1.E4*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 29); - VERIFY_IS_EQUAL(lm.njev(), 28); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 29); + // VERIFY_IS_EQUAL(lm.njev(), 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03); // check x @@ -1329,11 +1356,12 @@ void testNistRat43(void) lm.setFtol(1.E6*NumTraits::epsilon()); lm.setXtol(1.E6*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 27); - VERIFY_IS_EQUAL(lm.njev(), 20); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 27); + // VERIFY_IS_EQUAL(lm.njev(), 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03); // check x @@ -1351,11 +1379,12 @@ void testNistRat43(void) lm.setFtol(1.E5*NumTraits::epsilon()); lm.setXtol(1.E5*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03); // check x @@ -1414,11 +1443,12 @@ void testNistEckerle4(void) eckerle4_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // 
VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03); // check x @@ -1432,11 +1462,12 @@ void testNistEckerle4(void) x<< 1.5, 5., 450.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 7); - VERIFY_IS_EQUAL(lm.njev(), 6); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 7); + // VERIFY_IS_EQUAL(lm.njev(), 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03); // check x -- GitLab From f6954e4485cdeaf10cf380ab29feeb95c492eb18 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 2 Mar 2022 11:18:17 -0800 Subject: [PATCH 161/266] Fix enum conversion warnings in BooleanRedux. (cherry picked from commit 55c7400db5f43c60ae8f9b475a296e897c05ddb3) --- Eigen/src/Core/BooleanRedux.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index 15e9f3f8d..d555c86f7 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -18,7 +18,7 @@ template struct all_unroller { enum { - IsRowMajor = (Derived::Flags & RowMajor), + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), i = (UnrollCount-1) / InnerSize, j = (UnrollCount-1) % InnerSize }; @@ -45,7 +45,7 @@ template struct any_unroller { enum { - IsRowMajor = (Derived::Flags & RowMajor), + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), i = (UnrollCount-1) / InnerSize, j = (UnrollCount-1) % InnerSize }; @@ -84,7 +84,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const enum { unroll = SizeAtCompileTime != Dynamic && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, - IsRowMajor = (internal::traits::Flags & RowMajor), + IsRowMajor = (int(internal::traits::Flags) & int(RowMajor)), InnerSizeAtCompileTime = IsRowMajor ? 
internal::traits::ColsAtCompileTime : internal::traits::RowsAtCompileTime }; Evaluator evaluator(derived()); @@ -110,7 +110,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const enum { unroll = SizeAtCompileTime != Dynamic && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, - IsRowMajor = (internal::traits::Flags & RowMajor), + IsRowMajor = (int(internal::traits::Flags) & int(RowMajor)), InnerSizeAtCompileTime = IsRowMajor ? internal::traits::ColsAtCompileTime : internal::traits::RowsAtCompileTime }; Evaluator evaluator(derived()); -- GitLab From b1f06aac610005a8ce7fcc6b1a78c46070d3e031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 3 Mar 2022 19:54:15 +0000 Subject: [PATCH 162/266] Update vectorization_logic tests for all platforms. (cherry picked from commit 27d8f29be38a7ef9a1d3bfd82f0a8dbf16ac0a7c) --- test/vectorization_logic.cpp | 75 +++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 97c0bdad9..f5c86afd3 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -39,11 +39,15 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling) { EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src); typedef internal::copy_using_evaluator_traits,internal::evaluator, internal::assign_op > traits; - bool res = traits::Traversal==traversal; - if(unrolling==InnerUnrolling+CompleteUnrolling) - res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); - else - res = res && int(traits::Unrolling)==unrolling; + // If traversal or unrolling are negative, ignore. + bool res = traversal > -1 ? 
traits::Traversal==traversal : true; + if (unrolling > -1) { + if(unrolling==InnerUnrolling+CompleteUnrolling) { + res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); + } else { + res = res && int(traits::Unrolling)==unrolling; + } + } if(!res) { std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; @@ -178,21 +182,15 @@ struct vectorization_logic typedef Matrix Vector3; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector3(),Vector3()+Vector3(), - sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), CompleteUnrolling)); - VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) - : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); + // Vectorization depends on too many factors - ignore. + VERIFY(test_assign(Vector3(),Vector3()+Vector3(), -1, CompleteUnrolling)); VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); + // Vectorization depends on too many factors - ignore. VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - sizeof(Scalar)==16 ? InnerVectorizedTraversal : - EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : - LinearTraversal, - NoUnrolling)); + -1, NoUnrolling)); VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling)); @@ -277,12 +275,20 @@ struct vectorization_logic_half }; static void run() { + // Some half-packets have a byte size < EIGEN_MIN_ALIGN_BYTES (e.g. 
Packet2f), + // which causes many of these tests to fail since they don't vectorize if + // EIGEN_UNALIGNED_VECTORIZE is 0 (the matrix is assumed unaligned). + // Adjust the matrix sizes to account for these alignment issues. + constexpr int PacketBytes = sizeof(Scalar)*PacketSize; + constexpr int MinVSize = EIGEN_UNALIGNED_VECTORIZE ? PacketSize + : PacketBytes >= EIGEN_MIN_ALIGN_BYTES ? PacketSize + : (EIGEN_MIN_ALIGN_BYTES + sizeof(Scalar) - 1) / sizeof(Scalar); - typedef Matrix Vector1; - typedef Matrix Matrix11; - typedef Matrix Matrix57; - typedef Matrix Matrix35; - typedef Matrix Matrix57u; + typedef Matrix Vector1; + typedef Matrix Matrix11; + typedef Matrix Matrix57; + typedef Matrix Matrix35; + typedef Matrix Matrix57u; typedef Matrix Matrix3; - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT +#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT VERIFY(test_assign(Vector1(),Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1()+Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector1(),Vector1().template segment(0).derived(), + VERIFY(test_assign(Vector1(),Vector1().template segment(0).derived(), EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Scalar(2.1)*Vector1()-Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment(0)-Vector1().template segment(0)).derived(), + VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment(0)-Vector1().template segment(0)).derived(), EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()), InnerVectorizedTraversal,CompleteUnrolling)); @@ -331,19 +337,16 @@ struct vectorization_logic_half typedef Matrix Matrix33c; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); - VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) - : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); - + + // Unrolling depends on read costs and unroll limits, which vary - ignore. VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), - PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); + PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal, -1)); VERIFY(test_assign(Matrix(),Matrix()+Matrix(), sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), NoUnrolling)); - VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling)); @@ -357,7 +360,7 @@ struct vectorization_logic_half VERIFY(test_redux(Vector1(), LinearVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_redux(Matrix(), + VERIFY(test_redux(Matrix(), LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_redux(Matrix3(), @@ -379,9 +382,9 @@ struct vectorization_logic_half Matrix >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling))); - VERIFY((test_assign(Matrix57(), Matrix()*Matrix(), - InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling))); - #endif + VERIFY((test_assign(Matrix57(), Matrix() * Matrix(), + InnerVectorizedTraversal, InnerUnrolling + CompleteUnrolling))); +#endif } }; -- GitLab From b6d9b6f48d9a9bd96f8237ef283dab341b575e1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 4 Mar 2022 21:22:02 +0000 Subject: [PATCH 163/266] Remove duplicate IsRowMajor declaration. (cherry picked from commit 0ae94456a0e6dd5e20ca65ba2f405964f6931faf) --- Eigen/src/Core/BooleanRedux.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index d555c86f7..fa4d7c331 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -83,9 +83,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, - IsRowMajor = (int(internal::traits::Flags) & int(RowMajor)), - InnerSizeAtCompileTime = IsRowMajor ? 
internal::traits::ColsAtCompileTime : internal::traits::RowsAtCompileTime + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) @@ -109,9 +107,7 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT, - IsRowMajor = (int(internal::traits::Flags) & int(RowMajor)), - InnerSizeAtCompileTime = IsRowMajor ? internal::traits::ColsAtCompileTime : internal::traits::RowsAtCompileTime + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) -- GitLab From b158fcaa7428c8f03c1792dc29723246a123a894 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 8 Mar 2022 21:21:20 +0000 Subject: [PATCH 164/266] Fix edge-case in zeta for large inputs. 
(cherry picked from commit 9296bb4b933973365d19b4b71e7d2b205d00a1ad) --- .../Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 10 +++++++++- unsupported/test/special_functions.cpp | 8 ++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index f1c260e29..bd2ac8308 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -1387,7 +1387,7 @@ struct zeta_impl { }; const Scalar maxnum = NumTraits::infinity(); - const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar zero = Scalar(0.0), half = Scalar(0.5), one = Scalar(1.0); const Scalar machep = cephes_helper::machep(); const Scalar nan = NumTraits::quiet_NaN(); @@ -1429,11 +1429,19 @@ struct zeta_impl { return s; } + // If b is zero, then the tail sum will also end up being zero. + // Exiting early here can prevent NaNs for some large inputs, where + // the tail sum computed below has term `a` which can overflow to `inf`. 
+ if (numext::equal_strict(b, zero)) { + return s; + } + w = a; s += b*w/(x-one); s -= half * b; a = one; k = zero; + for( i=0; i<12; i++ ) { a *= x + k; diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 589bb76e1..756f031c2 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -191,10 +191,10 @@ template void array_special_functions() // Check the zeta function against scipy.special.zeta { - ArrayType x(10), q(10), res(10), ref(10); - x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4; - q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3; - ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf; + ArrayType x(11), q(11), res(11), ref(11); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4, 2000; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3, 2000; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf, 0; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); ); -- GitLab From fab848d4f78a515f6aec72a45e383f593b14ea6d Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 16 Mar 2022 00:08:16 +0000 Subject: [PATCH 165/266] Remove workarounds for bad GCC-4 warnings (cherry picked from commit 514f90c9ffd161a17cde859fe686a70e8a6983fc) --- Eigen/src/Core/Dot.h | 11 +++-------- Eigen/src/Core/util/Meta.h | 3 +-- Eigen/src/SparseCore/SparseBlock.h | 7 +------ 3 files changed, 5 insertions(+), 16 deletions(-) diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 5c3441b92..abac7ad48 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -18,14 +18,9 @@ namespace internal { // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // 
looking at the static assertions. Thus this is a trick to get better compile errors. template + bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime && + ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) || + (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))> struct dot_nocheck { typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 81ae2a32d..3a0e5677e 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -696,8 +696,7 @@ struct has_binary_operator template Y))) > - // use ?: instead of || just to shut up a stupid gcc 4.3 warning + bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))> class meta_sqrt { enum { diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 5b4f6cc9f..c16caec70 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -429,12 +429,7 @@ struct unary_evaluator, IteratorBa enum { IsRowMajor = XprType::IsRowMajor, - - OuterVector = (BlockCols==1 && ArgType::IsRowMajor) - | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". - // revert to || as soon as not needed anymore. - (BlockRows==1 && !ArgType::IsRowMajor), - + OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor), CoeffReadCost = evaluator::CoeffReadCost, Flags = XprType::Flags }; -- GitLab From 3a4a4e9fdec929cc1f89c3f7542b67704dc1a252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 16 Mar 2022 17:33:53 +0000 Subject: [PATCH 166/266] Disable schur non-convergence test. 
(cherry picked from commit 01b5bc48ccf9ff24ace0f2074403ef9514c2f442) --- test/schur_complex.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp index 03e17e81d..26acb8c3a 100644 --- a/test/schur_complex.cpp +++ b/test/schur_complex.cpp @@ -54,7 +54,8 @@ template void schur(int size = MatrixType::ColsAtCompileTim VERIFY_IS_EQUAL(cs3.matrixT(), cs1.matrixT()); VERIFY_IS_EQUAL(cs3.matrixU(), cs1.matrixU()); cs3.setMaxIterations(1).compute(A); - VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success); + // The schur decomposition does often converge with a single iteration. + // VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success); VERIFY_IS_EQUAL(cs3.getMaxIterations(), 1); MatrixType Atriangular = A; -- GitLab From 6469fbf93ad44527fa9208ea42661f34fcf44e25 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 16 Mar 2022 21:46:04 +0000 Subject: [PATCH 167/266] Work around g++-10 docker issue for geo_orthomethods_4. 
(cherry picked from commit 9deaa19121e9c64a5cbd81b14feffdb7632646eb) --- test/geo_orthomethods.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp index b7b660740..5f7ddb91f 100644 --- a/test/geo_orthomethods.cpp +++ b/test/geo_orthomethods.cpp @@ -73,8 +73,9 @@ template void orthomethods_3() // check mixed product typedef Matrix RealVector3; RealVector3 rv1 = RealVector3::Random(); - VERIFY_IS_APPROX(v1.cross(rv1.template cast()), v1.cross(rv1)); - VERIFY_IS_APPROX(rv1.template cast().cross(v1), rv1.cross(v1)); + v2 = rv1.template cast(); + VERIFY_IS_APPROX(v1.cross(v2), v1.cross(rv1)); + VERIFY_IS_APPROX(v2.cross(v1), rv1.cross(v1)); } template void orthomethods(int size=Size) -- GitLab From c473d69d2260f4b4c5177a92219b0215433150a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=98ystein=20S=C3=B8rensen?= Date: Thu, 17 Mar 2022 14:52:07 +0000 Subject: [PATCH 168/266] Completed a missing parenthesis in tutorial. (cherry picked from commit c062983464aec44355cbf18d3cb3e8d6e135d821) --- doc/TutorialSlicingIndexing.dox | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index 98ace43e4..d42e70f2b 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox @@ -72,7 +72,7 @@ Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v. %Block starting at \c i,j having \c m rows, and \c n columns - \code A(seqN(i,m), seqN(i,n) \endcode + \code A(seqN(i,m), seqN(i,n)) \endcode \code A.block(i,j,m,n) \endcode -- GitLab From 5cb2dfec1d5c0376938df074e5488606c1178269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tobias=20Schl=C3=BCter?= Date: Thu, 17 Mar 2022 15:28:12 +0000 Subject: [PATCH 169/266] Fix RowMajorBit <-> RowMajor mixup. 
(cherry picked from commit 40eb34bc5dea506fdbb2508c35d9965bb8c68013) --- Eigen/src/SVD/UpperBidiagonalization.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 997defc47..5afebef9e 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -161,13 +161,13 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; - enum { StorageOrder = traits::Flags & RowMajorBit }; - typedef InnerStride ColInnerStride; - typedef InnerStride RowInnerStride; + enum { StorageOrder = (traits::Flags & RowMajorBit) ? RowMajor : ColMajor }; + typedef InnerStride ColInnerStride; + typedef InnerStride RowInnerStride; typedef Ref, 0, ColInnerStride> SubColumnType; typedef Ref, 0, RowInnerStride> SubRowType; typedef Ref > SubMatType; - + Index brows = A.rows(); Index bcols = A.cols(); @@ -293,7 +293,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona Index size = (std::min)(rows, cols); // X and Y are work space - enum { StorageOrder = traits::Flags & RowMajorBit }; + enum { StorageOrder = (traits::Flags & RowMajorBit) ? RowMajor : ColMajor }; Matrix Date: Thu, 17 Mar 2022 20:50:26 +0000 Subject: [PATCH 170/266] Work around MSVC compiler bug dropping `const`. (cherry picked from commit 3ca1228d451b5d156de6efe2e4514f4ec59c5ed4) --- Eigen/src/Core/Diagonal.h | 5 +++-- Eigen/src/Core/Transpose.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 3112d2c16..91f4a6eae 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -191,7 +191,8 @@ MatrixBase::diagonal() /** This is the const version of diagonal(). 
*/ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline +const typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -217,7 +218,7 @@ MatrixBase::diagonal(Index index) /** This is the const version of diagonal(Index). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline const typename MatrixBase::ConstDiagonalDynamicIndexReturnType MatrixBase::diagonal(Index index) const { return ConstDiagonalDynamicIndexReturnType(derived(), index); diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 2bc658f40..dfda90afd 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -191,7 +191,7 @@ DenseBase::transpose() * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename DenseBase::ConstTransposeReturnType +const typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); -- GitLab From 973b04f3e14e1e454c7702e52384ad02179565c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 18 Mar 2022 16:04:53 +0000 Subject: [PATCH 171/266] Fix AVX512 builds with MSVC. 
(cherry picked from commit 9a14d91a9909cc430638ac750d323df10194b84e) --- CMakeLists.txt | 10 +++++++++- Eigen/src/Core/arch/AVX512/PacketMath.h | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f40cf7738..af95353f8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -363,11 +363,19 @@ else() endif() option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF) - if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) + option(EIGEN_TEST_AVX2 "Enable/Disable FMA/AVX2 in tests/examples" OFF) + if((EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) OR EIGEN_TEST_AVX2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") message(STATUS "Enabling FMA/AVX2 in tests/examples") endif() + option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) + option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) + if(EIGEN_TEST_AVX512 OR EIGEN_TEST_AVX512DQ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512") + message(STATUS "Enabling AVX512 in tests/examples") + endif() + endif() option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 96d85ff20..75c18a23a 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -693,7 +693,7 @@ EIGEN_STRONG_INLINE Packet8d pload(const double* from) { template <> EIGEN_STRONG_INLINE Packet16i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( - reinterpret_cast(from)); + reinterpret_cast(from)); } template <> -- GitLab From ac78f84b729602f0c2a5c5f31e37f2147622edef Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 29 Mar 2022 14:20:13 -0700 Subject: [PATCH 172/266] Eliminate trace unused warning. 
(cherry picked from commit 9bc9992dd37e0379be888186a234b7641af306f7) --- unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h | 1 + 1 file changed, 1 insertion(+) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 926ecdd38..bbd2ff332 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -134,6 +134,7 @@ struct TensorEvaluator, Device> } } + EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims); eigen_assert(num_distinct_reduce_dims == NumReducedDims); // Compute the dimensions of the result. -- GitLab From 8a21df2d9cc2812bd487df2b7e3d7bce0d9bdfd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 30 Mar 2022 18:35:32 +0000 Subject: [PATCH 173/266] Disable f16c scalar conversions for MSVC. (cherry picked from commit 73b2c13bf2d4c8192ce1cdf7ceeb8d098cfe6b71) --- Eigen/src/Core/arch/Default/Half.h | 14 ++++++++++++-- test/half_float.cpp | 25 ++++++++++++++----------- 2 files changed, 26 insertions(+), 13 deletions(-) diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 9f8e8cc1e..2eb4bdba5 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -534,7 +534,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #elif defined(EIGEN_HAS_FP16_C) __half_raw h; - h.x = _cvtss_sh(ff, 0); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. 
+ h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0); + #else + h.x = _cvtss_sh(ff, 0); + #endif return h; #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) @@ -595,7 +600,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x))); + #else + return _cvtsh_ss(h.x); + #endif #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return static_cast(h.x); #else diff --git a/test/half_float.cpp b/test/half_float.cpp index 729de1bc7..ffb3215b9 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -224,6 +224,8 @@ void test_comparison() void test_basic_functions() { + const float PI = static_cast(EIGEN_PI); + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); @@ -251,8 +253,8 @@ void test_basic_functions() VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); - VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); + VERIFY_IS_APPROX(float(numext::exp(half(PI))), 20.f + PI); + VERIFY_IS_APPROX(float(exp(half(PI))), 20.f + PI); VERIFY_IS_EQUAL(float(numext::expm1(half(0.0f))), 0.0f); VERIFY_IS_EQUAL(float(expm1(half(0.0f))), 0.0f); @@ -277,25 +279,26 @@ void test_basic_functions() void test_trigonometric_functions() { + const float PI = static_cast(EIGEN_PI); VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - 
// VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::cos(half(PI)), half(cosf(PI))); + // VERIFY_IS_APPROX(numext::cos(half(PI/2)), half(cosf(PI/2))); + // VERIFY_IS_APPROX(numext::cos(half(3*PI/2)), half(cosf(3*PI/2))); VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); + // VERIFY_IS_APPROX(numext::sin(half(PI)), half(sinf(PI))); + VERIFY_IS_APPROX(numext::sin(half(PI/2)), half(sinf(PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3*PI/2)), half(sinf(3*PI/2))); VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); + // VERIFY_IS_APPROX(numext::tan(half(PI)), half(tanf(PI))); + // VERIFY_IS_APPROX(numext::tan(half(PI/2)), half(tanf(PI/2))); + //VERIFY_IS_APPROX(numext::tan(half(3*PI/2)), half(tanf(3*PI/2))); VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); } -- GitLab From 8e7bd5397fe93da01805ed840e9c2155122cb2b3 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Sat, 26 Mar 2022 14:39:31 +0200 Subject: [PATCH 174/266] fixed order of arguments in blas syrk (cherry picked from commit 1ddd3e29cbdf0ff5e06ac974ee0607f75c28fe36) --- blas/level3_impl.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 6dd6338b4..66216c964 100644 --- 
a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -362,18 +362,18 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const int *n, const typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking&); static const functype func[8] = { // array index: NOTR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0, // array index: NOTR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0 }; #endif -- GitLab From 86d958e8f25a061ecbfcb0470186162ed7794eb6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 1 Apr 2022 17:00:24 +0000 Subject: [PATCH 175/266] Consider inf/nan in scalar test_isApprox. 
(cherry picked from commit 0c859cf35d67fe2abf8bc79810e88732429fc88c) --- test/main.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/test/main.h b/test/main.h index 07f3794ac..3dd094249 100644 --- a/test/main.h +++ b/test/main.h @@ -422,7 +422,13 @@ template<> inline long double test_precision >() { ret #define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE) \ inline bool test_isApprox(TYPE a, TYPE b) \ - { return internal::isApprox(a, b, test_precision()); } \ + { return numext::equal_strict(a, b) || \ + ((numext::isnan)(a) && (numext::isnan)(b)) || \ + (internal::isApprox(a, b, test_precision())); } \ + inline bool test_isCwiseApprox(TYPE a, TYPE b, bool exact) \ + { return numext::equal_strict(a, b) || \ + ((numext::isnan)(a) && (numext::isnan)(b)) || \ + (!exact && internal::isApprox(a, b, test_precision())); } \ inline bool test_isMuchSmallerThan(TYPE a, TYPE b) \ { return internal::isMuchSmallerThan(a, b, test_precision()); } \ inline bool test_isApproxOrLessThan(TYPE a, TYPE b) \ -- GitLab From af912a7b5c2c9b0a567f1802fce86c7a3fae5560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 8 Apr 2022 18:05:32 +0000 Subject: [PATCH 176/266] Fix MSVC+CUDA issues. 
(cherry picked from commit 5ed7a86ae96d411c450fb190f5a725f38f2aea9d) --- Eigen/src/Core/DenseBase.h | 4 ++-- Eigen/src/Core/Diagonal.h | 16 ++++++++-------- Eigen/src/Core/MatrixBase.h | 18 ++++++------------ Eigen/src/Core/SolverBase.h | 12 ++++++------ Eigen/src/Core/Transpose.h | 2 +- Eigen/src/SparseCore/SparseMatrixBase.h | 2 +- .../Eigen/CXX11/src/Tensor/TensorBlock.h | 4 ++-- .../CXX11/src/Tensor/TensorContractionGpu.h | 2 +- 8 files changed, 27 insertions(+), 33 deletions(-) diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 9b16db68d..cdd0f5f16 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -324,9 +324,9 @@ template class DenseBase typedef Transpose TransposeReturnType; EIGEN_DEVICE_FUNC TransposeReturnType transpose(); - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; EIGEN_DEVICE_FUNC - ConstTransposeReturnType transpose() const; + const ConstTransposeReturnType transpose() const; EIGEN_DEVICE_FUNC void transposeInPlace(); diff --git a/Eigen/src/Core/Diagonal.h b/Eigen/src/Core/Diagonal.h index 91f4a6eae..ad5bccd71 100644 --- a/Eigen/src/Core/Diagonal.h +++ b/Eigen/src/Core/Diagonal.h @@ -210,18 +210,18 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline Diagonal MatrixBase::diagonal(Index index) { - return DiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** This is the const version of diagonal(Index). 
*/ template -EIGEN_DEVICE_FUNC inline const typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline const Diagonal MatrixBase::diagonal(Index index) const { - return ConstDiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this @@ -238,20 +238,20 @@ MatrixBase::diagonal(Index index) const template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template DiagonalIndexReturnType::Type +inline Diagonal MatrixBase::diagonal() { - return typename DiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } /** This is the const version of diagonal(). */ template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type +inline const Diagonal MatrixBase::diagonal() const { - return typename ConstDiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } } // end namespace Eigen diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h index 45c3a596e..d93a7e377 100644 --- a/Eigen/src/Core/MatrixBase.h +++ b/Eigen/src/Core/MatrixBase.h @@ -206,28 +206,22 @@ template class MatrixBase EIGEN_DEVICE_FUNC DiagonalReturnType diagonal(); - typedef typename internal::add_const >::type ConstDiagonalReturnType; + typedef Diagonal ConstDiagonalReturnType; EIGEN_DEVICE_FUNC - ConstDiagonalReturnType diagonal() const; - - template struct DiagonalIndexReturnType { typedef Diagonal Type; }; - template struct ConstDiagonalIndexReturnType { typedef const Diagonal Type; }; + const ConstDiagonalReturnType diagonal() const; template EIGEN_DEVICE_FUNC - typename DiagonalIndexReturnType::Type diagonal(); + Diagonal diagonal(); template EIGEN_DEVICE_FUNC - typename ConstDiagonalIndexReturnType::Type diagonal() const; - - typedef Diagonal DiagonalDynamicIndexReturnType; - typedef typename internal::add_const >::type ConstDiagonalDynamicIndexReturnType; + 
const Diagonal diagonal() const; EIGEN_DEVICE_FUNC - DiagonalDynamicIndexReturnType diagonal(Index index); + Diagonal diagonal(Index index); EIGEN_DEVICE_FUNC - ConstDiagonalDynamicIndexReturnType diagonal(Index index) const; + const Diagonal diagonal(Index index) const; template struct TriangularViewReturnType { typedef TriangularView Type; }; template struct ConstTriangularViewReturnType { typedef const TriangularView Type; }; diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h index 501461042..e38b3d5ad 100644 --- a/Eigen/src/Core/SolverBase.h +++ b/Eigen/src/Core/SolverBase.h @@ -110,7 +110,7 @@ class SolverBase : public EigenBase } /** \internal the return type of transpose() */ - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; /** \returns an expression of the transposed of the factored matrix. * * A typical usage is to solve for the transposed problem A^T x = b: @@ -118,16 +118,16 @@ class SolverBase : public EigenBase * * \sa adjoint(), solve() */ - inline ConstTransposeReturnType transpose() const + inline const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); } /** \internal the return type of adjoint() */ typedef typename internal::conditional::IsComplex, - CwiseUnaryOp, ConstTransposeReturnType>, - ConstTransposeReturnType - >::type AdjointReturnType; + CwiseUnaryOp, const ConstTransposeReturnType>, + const ConstTransposeReturnType + >::type AdjointReturnType; /** \returns an expression of the adjoint of the factored matrix * * A typical usage is to solve for the adjoint problem A' x = b: @@ -137,7 +137,7 @@ class SolverBase : public EigenBase * * \sa transpose(), solve() */ - inline AdjointReturnType adjoint() const + inline const AdjointReturnType adjoint() const { return AdjointReturnType(derived().transpose()); } diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index dfda90afd..741504d95 100644 --- 
a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -178,7 +178,7 @@ template class TransposeImpl * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Transpose +typename DenseBase::TransposeReturnType DenseBase::transpose() { return TransposeReturnType(derived()); diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 229449f02..c8151ed09 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -113,7 +113,7 @@ template class SparseMatrixBase Transpose >::type AdjointReturnType; typedef Transpose TransposeReturnType; - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order typedef SparseMatrix PlainObject; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 1e55d12c4..243b3fb7b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -242,7 +242,7 @@ class TensorBlockDescriptor { const DestinationBufferKind& kind() const { return m_kind; } private: - friend class TensorBlockDescriptor; + friend class TensorBlockDescriptor; DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} @@ -706,7 +706,7 @@ class TensorMaterializedBlock { } private: - friend class TensorMaterializedBlock; + friend class TensorMaterializedBlock; Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index c81803827..bf9194d88 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -233,7 +233,7 @@ 
EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } \ } \ -#define writeRegToShmem(_) \ +#define writeRegToShmem() \ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ rhs_shmem[rhs_store_idx_0] = rhs_pf0; \ \ -- GitLab From 5cb7505a446a3bf758eb82f05d3d56a5ac80f933 Mon Sep 17 00:00:00 2001 From: Rohan Ghige Date: Wed, 18 May 2022 14:42:57 +0000 Subject: [PATCH 177/266] Fix 'Incorrect reference code in STL_interface.hh for ata_product' eigen/isses/2425 (cherry picked from commit 798fc1c577fea1b75e511ec63a72237d9122b358) --- bench/btl/libs/STL/STL_interface.hh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh index 16658c4ba..5b391c6ef 100644 --- a/bench/btl/libs/STL/STL_interface.hh +++ b/bench/btl/libs/STL/STL_interface.hh @@ -84,9 +84,12 @@ public : for (int j=0;j=j) + { for (int k=0;k Date: Fri, 20 May 2022 08:17:07 -0700 Subject: [PATCH 178/266] Fix BDCSVD condition for failing with numerical issue. (cherry picked from commit 481a4a8c319640a3689be11c66b38cf1f9dc50b2) --- Eigen/src/SVD/BDCSVD.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 6dcb986c1..79a6562b7 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -1035,7 +1035,7 @@ void BDCSVD::perturbCol0 #endif // Avoid index out of bounds. // Will end up setting zhat(k) = 0. 
- if (l == 0) { + if (i >= k && l == 0) { m_info = NumericalIssue; prod = 0; break; -- GitLab From 80c5b8b3c3114113abdc5f78b4bce8f1e9ddb4a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 7 Jun 2022 17:06:17 +0000 Subject: [PATCH 179/266] Fix ambiguous comparisons for c++20 (again again) (cherry picked from commit 8c2e0e3cb8c6ddcd828d6f1d2062a243c0dc9948) --- unsupported/Eigen/CXX11/src/Tensor/TensorBase.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 35b6458e5..fcc7411af 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -513,34 +513,34 @@ class TensorBase // Comparisons and tests. template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { + operator<(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { + operator<=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { + operator>(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { + operator>=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> 
- operator==(const OtherDerived& other) const { + operator==(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { + operator!=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } -- GitLab From 3df7d7fec9519d46c2674fe1a278f3bdcbae9b1b Mon Sep 17 00:00:00 2001 From: sfalmo Date: Tue, 7 Jun 2022 17:28:19 +0000 Subject: [PATCH 180/266] Fix row vs column vector typo in Matrix class tutorial (cherry picked from commit 9960a304222919b5ab419e2f66196fd4500c2af7) --- doc/TutorialMatrixClass.dox | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox index 2c452220f..9b6b4b1f0 100644 --- a/doc/TutorialMatrixClass.dox +++ b/doc/TutorialMatrixClass.dox @@ -111,9 +111,9 @@ Vector4d c(5.0, 6.0, 7.0, 8.0); If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients: \code -Vector2i a(1, 2); // A column vector containing the elements {1, 2} -Matrix b {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} -Matrix c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} +Vector2i a(1, 2); // A column-vector containing the elements {1, 2} +Matrix b {1, 2, 3, 4, 5}; // A column-vector containing the elements {1, 2, 3, 4, 5} +Matrix c = {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} \endcode In the general case of matrices and vectors with either fixed or runtime sizes, -- GitLab From b9ac284e52e4fac2ca7a18e7fa5047bbfb47544b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 21 Jun 2022 16:29:59 +0000 Subject: [PATCH 181/266] Use numext::sqrt in Householder.h. 
(cherry picked from commit 0e083b172e2e7d7527f15d5c5e8703a8e0f17109) --- Eigen/src/Householder/Householder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 5bc037f00..d8984a347 100644 --- a/Eigen/src/Householder/Householder.h +++ b/Eigen/src/Householder/Householder.h @@ -69,7 +69,7 @@ void MatrixBase::makeHouseholder( Scalar& tau, RealScalar& beta) const { - using std::sqrt; + using numext::sqrt; using numext::conj; EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart) -- GitLab From a11bdf39657400ab1a727ec908a66b777e466056 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 24 Jun 2022 15:10:36 +0000 Subject: [PATCH 182/266] Skip f16/bf16 bessel specializations on AVX512 if unavailable. (cherry picked from commit 8ed3b9dcd6dd2e58ec0ad27438d09a90c72e549a) --- .../Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h index 7dd3c3e5b..909b08e16 100644 --- a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h +++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h @@ -4,6 +4,9 @@ namespace Eigen { namespace internal { +// Bessel functions only available for some compilers. 
+#if EIGEN_HAS_AVX512_MATH + F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0) @@ -40,6 +43,8 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0) F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1) +#endif + } // namespace internal } // namespace Eigen -- GitLab From f55a112cb1283b720f8bddb116708a976e128360 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Sat, 9 Jul 2022 04:56:36 +0000 Subject: [PATCH 183/266] Fix ODR violations. (cherry picked from commit bb51d9f4fa3cf1114348b9180640d6da7d3964f9) --- Eigen/src/SparseLU/SparseLU_Structs.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h index cf5ec449b..16a0c41f4 100644 --- a/Eigen/src/SparseLU/SparseLU_Structs.h +++ b/Eigen/src/SparseLU/SparseLU_Structs.h @@ -70,8 +70,8 @@ #define EIGEN_LU_STRUCTS namespace Eigen { namespace internal { - -typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; + +enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL}; template struct LU_GlobalLU_t { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 4f7fd340e..a8a535a7f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -254,10 +254,10 @@ struct nested > // the SAME case. // When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, // Pc=0. 
-typedef enum { +enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 -} PaddingType; +}; } // end namespace Eigen -- GitLab From ea57f9b78f54c6de675d39dd81796d08e9593ca6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Sat, 9 Jul 2022 23:57:09 +0000 Subject: [PATCH 184/266] AutoDiff depends on Core, so include appropriate header. (cherry picked from commit e1165dbf9a16527ab085bec2749b02096fa1b584) --- unsupported/Eigen/AutoDiff | 2 ++ 1 file changed, 2 insertions(+) diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff index 7a4ff460c..e10875e71 100644 --- a/unsupported/Eigen/AutoDiff +++ b/unsupported/Eigen/AutoDiff @@ -10,6 +10,8 @@ #ifndef EIGEN_AUTODIFF_MODULE #define EIGEN_AUTODIFF_MODULE +#include "../../Eigen/Core" + namespace Eigen { /** -- GitLab From 6aaa45db5fd7cda65582b41a83c0fea998c34e80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 22 Jul 2022 02:27:42 +0000 Subject: [PATCH 185/266] Include immintrin.h header for enscripten. (cherry picked from commit 34780d8bd13d0af0cf17a22789ef286e8512594d) --- Eigen/src/Core/util/ConfigureVectorization.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 2d12e1d17..259ef0ca1 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -339,7 +339,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. 
However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 + #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN #include #else #include -- GitLab From a5469a6f0f7d7ca65e37f6801552fb28eb8da984 Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Fri, 29 Jul 2022 18:02:51 +0000 Subject: [PATCH 186/266] Avoid including with EIGEN_NO_IO (cherry picked from commit b7668c0371054a3a938eeab32a5c10d24c1ea4fc) --- Eigen/Core | 2 +- Eigen/src/Core/arch/Default/Half.h | 2 -- Eigen/src/SparseCore/SparseMatrix.h | 2 ++ Eigen/src/SparseCore/SparseMatrixBase.h | 3 ++- Eigen/src/SparseCore/SparseVector.h | 2 ++ Eigen/src/SparseLU/SparseLU.h | 7 +++++-- 6 files changed, 12 insertions(+), 6 deletions(-) diff --git a/Eigen/Core b/Eigen/Core index 3c03519fe..1e53ba49b 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -83,8 +83,8 @@ #include #include #include -#include #ifndef EIGEN_NO_IO + #include #include #endif #include diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 2eb4bdba5..e4a17cbfa 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -36,8 +36,6 @@ #ifndef EIGEN_HALF_H #define EIGEN_HALF_H -#include - #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) // When compiling with GPU support, the "__half_raw" base class as well as // some other routines are defined in the GPU compiler header files diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 616b4a0c2..9fc06b5e7 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -793,6 +793,7 @@ class SparseMatrix template EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other); +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m) { EIGEN_DBG_SPARSE( @@ -837,6 +838,7 @@ class SparseMatrix s << static_cast&>(m); return s; } +#endif /** Destructor */ inline 
~SparseMatrix() diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index c8151ed09..417a2365e 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -214,7 +214,7 @@ template class SparseMatrixBase inline void assignGeneric(const OtherDerived& other); public: - +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m) { typedef typename Derived::Nested Nested; @@ -263,6 +263,7 @@ template class SparseMatrixBase } return s; } +#endif template Derived& operator+=(const SparseMatrixBase& other); diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 05779be68..106925be4 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -329,6 +329,7 @@ class SparseVector } #endif +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseVector& m) { for (Index i=0; i::factorize(const MatrixType& matrix) info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu); if ( info ) { - m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT "; + m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR"; +#ifndef EIGEN_NO_IO std::ostringstream returnInfo; - returnInfo << info; + returnInfo << " ... ZERO COLUMN AT "; + returnInfo << info; m_lastError += returnInfo.str(); +#endif m_info = NumericalIssue; m_factorizationIsOk = false; return; -- GitLab From 61efca2e907c53ab517408a73bc962c07b1b369f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 29 Jul 2022 20:17:23 +0000 Subject: [PATCH 187/266] Use numext::sqrt in ConjugateGradient. 
(cherry picked from commit 7896c7dc6bd1bd34dd9636bdd3426e3c28e6a246) --- Eigen/src/IterativeLinearSolvers/ConjugateGradient.h | 6 ++---- test/vectorization_logic.cpp | 6 +++--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 5d8c6b433..c3ca0ad54 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -29,8 +29,6 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters, typename Dest::RealScalar& tol_error) { - using std::sqrt; - using std::abs; typedef typename Dest::RealScalar RealScalar; typedef typename Dest::Scalar Scalar; typedef Matrix VectorType; @@ -56,7 +54,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, if (residualNorm2 < threshold) { iters = 0; - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); return; } @@ -86,7 +84,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, p = z + beta * p; // update search direction i++; } - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); iters = i; } diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index f5c86afd3..b5464992e 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -279,10 +279,10 @@ struct vectorization_logic_half // which causes many of these tests to fail since they don't vectorize if // EIGEN_UNALIGNED_VECTORIZE is 0 (the matrix is assumed unaligned). // Adjust the matrix sizes to account for these alignment issues. - constexpr int PacketBytes = sizeof(Scalar)*PacketSize; - constexpr int MinVSize = EIGEN_UNALIGNED_VECTORIZE ? PacketSize + enum { PacketBytes = sizeof(Scalar)*PacketSize }; + enum { MinVSize = EIGEN_UNALIGNED_VECTORIZE ? 
PacketSize : PacketBytes >= EIGEN_MIN_ALIGN_BYTES ? PacketSize - : (EIGEN_MIN_ALIGN_BYTES + sizeof(Scalar) - 1) / sizeof(Scalar); + : (EIGEN_MIN_ALIGN_BYTES + sizeof(Scalar) - 1) / sizeof(Scalar) }; typedef Matrix Vector1; typedef Matrix Matrix11; -- GitLab From a9490cd3c51763e8f17c46c9723aeb17ccb32bf2 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 8 Aug 2022 18:48:36 +0000 Subject: [PATCH 188/266] Fix code and unit test for a few corner cases in vectorized pow() (cherry picked from commit 7a87ed1b6a49bd0067856dcba9ad9a3a46186220) --- .../arch/Default/GenericPacketMathFunctions.h | 73 +++++++++---------- test/array_cwise.cpp | 6 +- 2 files changed, 36 insertions(+), 43 deletions(-) diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index c9fbaf68b..95fb686a1 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -1443,39 +1443,40 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { } // Generic implementation of pow(x,y). -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet generic_pow(const Packet& x, const Packet& y) { +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) { typedef typename unpacket_traits::type Scalar; const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_neg_inf = pset1(-NumTraits::infinity()); const Packet cst_zero = pset1(Scalar(0)); const Packet cst_one = pset1(Scalar(1)); const Packet cst_nan = pset1(NumTraits::quiet_NaN()); const Packet abs_x = pabs(x); // Predicates for sign and magnitude of x. 
- const Packet x_is_zero = pcmp_eq(x, cst_zero); - const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero); + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero); const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); - const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); - const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); - const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); // Predicates for sign and magnitude of y. + const Packet abs_y = pabs(y); const Packet y_is_one = pcmp_eq(y, cst_one); - const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero); const Packet y_is_neg = pcmp_lt(y, cst_zero); - const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg)); const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); - const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf); EIGEN_CONSTEXPR Scalar huge_exponent = - (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / - NumTraits::epsilon(); + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits::epsilon(); const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); // Predicates for whether y is integer and/or even. 
@@ -1484,39 +1485,31 @@ Packet generic_pow(const Packet& x, const Packet& y) { const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); // Predicates encoding special cases for the value of pow(x,y) - const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), - y_is_int), - abs_y_is_inf); - const Packet pow_is_one = por(por(x_is_one, y_is_zero), - pand(x_is_neg_one, - por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf); const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); - const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), - pand(abs_x_is_inf, y_is_neg)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_pos)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_neg)); - const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), - pand(abs_x_is_inf, y_is_pos)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_neg)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_pos)); + const Packet pow_is_one = + por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos)); + const Packet inf_val = + pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even), + cst_neg_inf, cst_pos_inf); // General computation of pow(x,y) for positive x or negative x and integer y. 
const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); const Packet pow_abs = generic_pow_impl(abs_x, y); - return pselect(y_is_one, x, - pselect(pow_is_one, cst_one, - pselect(pow_is_nan, cst_nan, - pselect(pow_is_inf, cst_pos_inf, - pselect(pow_is_zero, cst_zero, - pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); + return pselect( + y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, inf_val, + pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); } - - /* polevl (modified for Eigen) * * Evaluate polynomial diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 0cc438b39..238883090 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -72,9 +72,9 @@ void pow_test() { for (int j = 0; j < num_cases; ++j) { Scalar e = static_cast(std::pow(x(i,j), y(i,j))); Scalar a = actual(i, j); - bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e)); - all_pass &= !fail; - if (fail) { + bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e)); + all_pass &= success; + if (!success) { std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl; } } -- GitLab From 33a602eb3720c160c94f080f8986b46bff48bbe0 Mon Sep 17 00:00:00 2001 From: Lexi Bromfield Date: Tue, 9 Aug 2022 20:00:34 +0000 Subject: [PATCH 189/266] Don't double-define Half functions on aarch64 (cherry picked from commit 66ea0c09fdd939ae2741cee1f5a9961b64d5adcd) --- Eigen/src/Core/arch/Default/Half.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index e4a17cbfa..6e2b31f76 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -332,7 +332,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { } #endif -#if 
defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); } -- GitLab From 669dc8fadf88d6aa23919feba463f1122f110ccf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 9 Aug 2022 22:42:30 +0000 Subject: [PATCH 190/266] Eliminate bool bitwise warnings. (cherry picked from commit b8e93bf589fa66da404c66c48dc512b3e7484713) --- .../Eigen/CXX11/src/Tensor/TensorIndexList.h | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 2d8c7b903..d3600eab3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -246,7 +246,7 @@ struct tuple_coeff { template EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { - return ((i == Idx) & is_compile_time_constant::ValType>::value) || + return ((i == Idx) && is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } @@ -468,7 +468,7 @@ struct index_statically_eq_impl { template struct index_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) == value); } }; @@ -476,7 +476,7 @@ struct index_statically_eq_impl > { template struct index_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) == value); } }; @@ -492,7 +492,7 @@ struct index_statically_ne_impl { template struct 
index_statically_ne_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) != value); } }; @@ -500,7 +500,7 @@ struct index_statically_ne_impl > { template struct index_statically_ne_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) != value); } }; @@ -516,7 +516,7 @@ struct index_statically_gt_impl { template struct index_statically_gt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) > value); } }; @@ -524,7 +524,7 @@ struct index_statically_gt_impl > { template struct index_statically_gt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) > value); } }; @@ -541,7 +541,7 @@ struct index_statically_lt_impl { template struct index_statically_lt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) < value); } }; @@ -549,7 +549,7 @@ struct index_statically_lt_impl > { template struct index_statically_lt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) < value); } }; @@ -566,7 +566,7 @@ struct index_pair_first_statically_eq_impl { template struct index_pair_first_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, 
const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).first == value); } }; @@ -574,7 +574,7 @@ struct index_pair_first_statically_eq_impl struct index_pair_first_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).first == value); } }; @@ -591,7 +591,7 @@ struct index_pair_second_statically_eq_impl { template struct index_pair_second_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).second == value); } }; @@ -599,7 +599,7 @@ struct index_pair_second_statically_eq_impl struct index_pair_second_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).second == value); } }; -- GitLab From d0e2b3e58dae8887673b5e53139c4c10701273fb Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Tue, 16 Aug 2022 18:14:41 +0000 Subject: [PATCH 191/266] Removed unnecessary checks for FP16C (cherry picked from commit 39fcc89798bc54501388348a448ea0e32fa5da7d) --- Eigen/src/Core/arch/AVX512/PacketMath.h | 49 ------------------------- 1 file changed, 49 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 75c18a23a..bf55319fc 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -1433,60 +1433,11 @@ ploadquad(const Eigen::half* from) { } EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C 
return _mm512_cvtph_ps(a); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif } EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - return _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif } template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { -- GitLab From 68f35d76b8bcdb19c99b2566d54b63fdb34be75d Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 25 Aug 2022 22:05:04 +0000 Subject: [PATCH 192/266] Fix GeneralizedEigenSolver::info() and Asserts (cherry picked from commit a7c1cac18bfef26ec61a73c1619ccf0f9b734745) --- .../src/Eigenvalues/GeneralizedEigenSolver.h | 33 ++++++++--------- test/eigensolver_generalized_real.cpp | 37 +++++++++++++++++++ 2 files changed, 53 insertions(+), 17 deletions(-) diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 87d789b3f..f9a7627d0 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -119,8 +119,8 @@ template class 
GeneralizedEigenSolver : m_eivec(), m_alphas(), m_betas(), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ() {} @@ -134,8 +134,8 @@ template class GeneralizedEigenSolver : m_eivec(size, size), m_alphas(size), m_betas(size), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(size), m_tmp(size) {} @@ -156,8 +156,8 @@ template class GeneralizedEigenSolver : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(A.cols()), m_tmp(A.cols()) { @@ -177,7 +177,8 @@ template class GeneralizedEigenSolver * \sa eigenvalues() */ EigenvectorsType eigenvectors() const { - eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors") + eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated"); return m_eivec; } @@ -201,7 +202,7 @@ template class GeneralizedEigenSolver */ EigenvalueType eigenvalues() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues."); return EigenvalueType(m_alphas,m_betas); } @@ -212,7 +213,7 @@ template class GeneralizedEigenSolver * \sa betas(), eigenvalues() */ ComplexVectorType alphas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas."); return m_alphas; } @@ -223,7 +224,7 @@ template class GeneralizedEigenSolver * \sa alphas(), eigenvalues() */ VectorType betas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && 
"GeneralizedEigenSolver failed to compute betas."); return m_betas; } @@ -254,7 +255,7 @@ template class GeneralizedEigenSolver ComputationInfo info() const { - eigen_assert(m_valuesOkay && "EigenSolver is not initialized."); + eigen_assert(m_isInitialized && "EigenSolver is not initialized."); return m_realQZ.info(); } @@ -277,7 +278,8 @@ template class GeneralizedEigenSolver EigenvectorsType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; - bool m_valuesOkay, m_vectorsOkay; + bool m_computeEigenvectors; + bool m_isInitialized; RealQZ m_realQZ; ComplexVectorType m_tmp; }; @@ -292,8 +294,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); Index size = A.cols(); - m_valuesOkay = false; - m_vectorsOkay = false; // Reduce to generalized real Schur form: // A = Q S Z and B = Q T Z m_realQZ.compute(A, B, computeEigenvectors); @@ -406,10 +406,9 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp i += 2; } } - - m_valuesOkay = true; - m_vectorsOkay = computeEigenvectors; } + m_computeEigenvectors = computeEigenvectors; + m_isInitialized = true; return *this; } diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp index 95ed431db..a0c99b18a 100644 --- a/test/eigensolver_generalized_real.cpp +++ b/test/eigensolver_generalized_real.cpp @@ -85,6 +85,42 @@ template void generalized_eigensolver_real(const MatrixType } } +template +void generalized_eigensolver_assert() { + GeneralizedEigenSolver eig; + // all raise assert if uninitialized + VERIFY_RAISES_ASSERT(eig.info()); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); + + // none raise assert after compute called + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == Success); + 
eig.eigenvectors(); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // eigenvectors() raises assert, if eigenvectors were not requested + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20), false); + VERIFY(eig.info() == Success); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // all except info raise assert if realQZ did not converge + eig.setMaxIterations(0); // force real QZ to fail. + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == NoConvergence); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); +} + EIGEN_DECLARE_TEST(eigensolver_generalized_real) { for(int i = 0; i < g_repeat; i++) { @@ -98,6 +134,7 @@ EIGEN_DECLARE_TEST(eigensolver_generalized_real) CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) ); CALL_SUBTEST_3( generalized_eigensolver_real(Matrix()) ); CALL_SUBTEST_4( generalized_eigensolver_real(Matrix2d()) ); + CALL_SUBTEST_5( generalized_eigensolver_assert() ); TEST_SET_BUT_UNUSED_VARIABLE(s) } } -- GitLab From 6576ee4fb1c20a15dc696326f3536f3fd13bab89 Mon Sep 17 00:00:00 2001 From: Gilles Aouizerate Date: Wed, 31 Aug 2022 19:54:42 +0000 Subject: [PATCH 193/266] 2 typos fix in the 3rd table. 
(cherry picked from commit 94cc83faa1a5603a408790dfb6821cec3c9012da) --- doc/TutorialSlicingIndexing.dox | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index d42e70f2b..e8c860a83 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox @@ -129,12 +129,12 @@ Here comes \link Eigen::lastN(SizeType) Eigen::lastN(size) \endlink, and \link E Bottom-right corner of A of size \c m times \c n - \code v(lastN(m), lastN(n)) \endcode + \code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode Bottom-right corner of A of size \c m times \c n - \code v(lastN(m), lastN(n)) \endcode + \code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode -- GitLab From ab6f39e1e3852677bd772922528cb7fe2e7d4e6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 2 Sep 2022 15:30:13 +0000 Subject: [PATCH 194/266] Fix mixingtypes tests. (cherry picked from commit d816044b6edb666ffe5ee7107762a29153f067c0) --- test/mixingtypes.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp index d450dbff8..2af7b8887 100644 --- a/test/mixingtypes.cpp +++ b/test/mixingtypes.cpp @@ -139,11 +139,12 @@ template void mixingtypes(int size = SizeAtCompileType) VERIFY_MIX_SCALAR(scd - vd.array() , scd - vd.template cast >().array()); // check scalar powers - VERIFY_MIX_SCALAR( pow(vcf.array(), sf), Eigen::pow(vcf.array(), complex(sf)) ); - VERIFY_MIX_SCALAR( vcf.array().pow(sf) , Eigen::pow(vcf.array(), complex(sf)) ); + // NOTE: scalar exponents use a unary op. 
+ VERIFY_IS_APPROX( pow(vcf.array(), sf), Eigen::pow(vcf.array(), complex(sf)) ); + VERIFY_IS_APPROX( vcf.array().pow(sf) , Eigen::pow(vcf.array(), complex(sf)) ); VERIFY_MIX_SCALAR( pow(sd, vcd.array()), Eigen::pow(complex(sd), vcd.array()) ); - VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); - VERIFY_MIX_SCALAR( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast >().array()) ); // check dot product -- GitLab From 11dacc4802b971272297e9c8d03beeb09bd9ddec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 2 Sep 2022 16:43:14 +0000 Subject: [PATCH 195/266] Fix some cmake issues. (cherry picked from commit f5364331eb4d33a6833e8ff7f2ac3bf1200bce21) --- CMakeLists.txt | 3 ++- bench/spbench/CMakeLists.txt | 4 ++-- test/CMakeLists.txt | 36 ++++++++++++++---------------------- 3 files changed, 18 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index af95353f8..22456f713 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,6 +125,7 @@ else() endif() option(EIGEN_BUILD_BTL "Build benchmark suite" OFF) +option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF) # Disable pkgconfig only for native Windows builds if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows) @@ -547,7 +548,7 @@ if(EIGEN_BUILD_BTL) add_subdirectory(bench/btl EXCLUDE_FROM_ALL) endif() -if(NOT WIN32) +if(NOT WIN32 AND EIGEN_BUILD_SPBENCH) add_subdirectory(bench/spbench EXCLUDE_FROM_ALL) endif() diff --git a/bench/spbench/CMakeLists.txt b/bench/spbench/CMakeLists.txt index b1860049c..75c36b095 100644 --- a/bench/spbench/CMakeLists.txt +++ b/bench/spbench/CMakeLists.txt @@ -1,7 +1,7 @@ -set(BLAS_FOUND TRUE) 
-set(LAPACK_FOUND TRUE) +set(BLAS_FOUND EIGEN_BUILD_BLAS) +set(LAPACK_FOUND EIGEN_BUILD_LAPACK) set(BLAS_LIBRARIES eigen_blas_static) set(LAPACK_LIBRARIES eigen_lapack_static) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 5136f82aa..0900435d5 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -42,45 +42,53 @@ endif() set(SPARSE_LIBS " ") find_package(CHOLMOD) -if(CHOLMOD_FOUND) +if(CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) add_definitions("-DEIGEN_CHOLMOD_SUPPORT") include_directories(${CHOLMOD_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ") + + ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ") endif() find_package(UMFPACK) -if(UMFPACK_FOUND) +if(UMFPACK_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_UMFPACK_SUPPORT") include_directories(${UMFPACK_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ") + + ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ") endif() find_package(KLU) -if(KLU_FOUND) +if(KLU_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_KLU_SUPPORT") include_directories(${KLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") + + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") endif() find_package(SuperLU 4.0) -if(SuperLU_FOUND) +if(SuperLU_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_SUPERLU_SUPPORT") 
include_directories(${SUPERLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") + + ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") endif() @@ -124,7 +132,7 @@ else() endif() find_package(SPQR) -if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) add_definitions("-DEIGEN_SPQR_SUPPORT") include_directories(${SPQR_INCLUDES}) set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) @@ -310,22 +318,6 @@ if(QT4_FOUND) ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}") endif() -if(UMFPACK_FOUND) - ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") -endif() - -if(KLU_FOUND OR SuiteSparse_FOUND) - ei_add_test(klu_support "" "${KLU_ALL_LIBS}") -endif() - -if(SUPERLU_FOUND) - ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") -endif() - -if(CHOLMOD_FOUND) - ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") -endif() - if(PARDISO_FOUND) ei_add_test(pardiso_support "" "${PARDISO_ALL_LIBS}") endif() @@ -334,7 +326,7 @@ if(PASTIX_FOUND AND (SCOTCH_FOUND OR METIS_FOUND)) ei_add_test(pastix_support "" "${PASTIX_ALL_LIBS}") endif() -if(SPQR_FOUND AND CHOLMOD_FOUND) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) ei_add_test(spqr_support "" "${SPQR_ALL_LIBS}") endif() -- GitLab From fd2817e3d6b4fd131d32241c28e4e9e0b4095b14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 2 Sep 2022 17:28:03 +0000 Subject: [PATCH 196/266] Add asserts for index-out-of-bounds in IndexedView. 
(cherry picked from commit f241a2c18a77cca64a71f6f4d25c1bac9b6dbb70) --- Eigen/src/Core/IndexedView.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 08476251d..1297d6a24 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -189,12 +189,16 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -204,6 +208,8 @@ struct unary_evaluator, IndexBased> EIGEN_STATIC_ASSERT_LVALUE(XprType) Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -212,6 +218,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? 
index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -220,6 +228,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } -- GitLab From 1000cf9fbc8a8c78d3a0ee4b5d0bbaacbaa302da Mon Sep 17 00:00:00 2001 From: Michael Palomas Date: Sun, 4 Sep 2022 17:50:43 +0000 Subject: [PATCH 197/266] fixed msvc compilation error in GeneralizedEigenSolver.h (cherry picked from commit 525f066671c401899ae458be6ecee473ec125f8b) --- Eigen/src/Eigenvalues/GeneralizedEigenSolver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index f9a7627d0..26324cee9 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -177,7 +177,7 @@ template class GeneralizedEigenSolver * \sa eigenvalues() */ EigenvectorsType eigenvectors() const { - eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors") + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors"); eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated"); return m_eivec; } -- GitLab From d641062a05a2f843cae8ba0eae6c2aa39dc404eb Mon Sep 17 00:00:00 2001 From: Gilles Aouizerate Date: Sun, 4 Sep 2022 21:20:21 +0000 Subject: [PATCH 198/266] fix typo in 
doc/TutorialSparse.dox (cherry picked from commit 6e83e906c21a49b44c4768c4aa590f7a21be4e96) --- doc/TutorialSparse.dox | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index c69171ec5..4faba418d 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -44,8 +44,8 @@ This storage scheme is better explained on an example. The following matrix and one of its possible sparse, \b column \b major representation: - - + +
Values: 227_3514__1_178
InnerIndices: 12_02 4__2_ 14
Values: 227_35_14_1_178
InnerIndices: 12_02_4_2_ 14
-- GitLab From 8cc3ec8e479dc449252168182d20050b34b5d2b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 8 Sep 2022 19:39:36 +0000 Subject: [PATCH 199/266] Fix realloc for non-trivial types. (cherry picked from commit 311ba66f7c76e27489e13961d0523d0ff3185969) --- Eigen/src/Core/util/Memory.h | 114 +++++++++++++++++++++++------------ 1 file changed, 77 insertions(+), 37 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 79a763ccd..2bf508440 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -292,7 +292,7 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) +template EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T *ptr, std::size_t size) { std::size_t i=0; EIGEN_TRY @@ -307,6 +307,46 @@ template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T * } } +/** \internal Copy-constructs the elements of an array. + * The \a size parameter tells on how many objects to copy. + */ +template EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T *ptr, const T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; +} + +/** \internal Move-constructs the elements of an array. + * The \a size parameter tells on how many objects to move. 
+ */ +template EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T *ptr, T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { +#if EIGEN_HAS_RVALUE_REFERENCES + for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i))); +#else + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); +#endif + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; +} + /***************************************************************************** *** Implementation of aligned new/delete-like functions *** *****************************************************************************/ @@ -325,10 +365,10 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t s template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); + T *result = static_cast(aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -341,10 +381,10 @@ template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) 
{ @@ -376,21 +416,32 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned { check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(new_size < old_size) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(new_size > old_size) + + // If elements need to be explicitly initialized, we cannot simply realloc + // (or memcpy) the memory block - each element needs to be reconstructed. + // Otherwise, objects that contain internal pointers like mpfr or + // AnnoyingScalar can be pointing to the wrong thing. + T* result = static_cast(conditional_aligned_malloc(sizeof(T)*new_size)); + EIGEN_TRY { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; + // Move-construct initial elements. + std::size_t copy_size = (std::min)(old_size, new_size); + move_construct_elements_of_array(result, pts, copy_size); + + // Default-construct remaining elements. + if (new_size > old_size) { + default_construct_elements_of_array(result + copy_size, new_size - old_size); } + + // Delete old elements. + conditional_aligned_delete(pts, old_size); + } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; } + return result; } @@ -400,12 +451,12 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned if(size==0) return 0; // short-cut. Also fixes Bug 884 check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); if(NumTraits::RequireInitialization) { EIGEN_TRY { - construct_elements_of_array(result, size); + default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) 
{ @@ -418,24 +469,13 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) { + if (NumTraits::RequireInitialization) { + return conditional_aligned_realloc_new(pts, new_size, old_size); + } + check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(NumTraits::RequireInitialization && (new_size < old_size)) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(NumTraits::RequireInitialization && (new_size > old_size)) - { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; - } - } - return result; + return static_cast(conditional_aligned_realloc(static_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); } template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) @@ -616,7 +656,7 @@ template class aligned_stack_memory_handler : noncopyable : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) - Eigen::internal::construct_elements_of_array(m_ptr, size); + Eigen::internal::default_construct_elements_of_array(m_ptr, size); } EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() @@ -667,7 +707,7 @@ struct local_nested_eval_wrapper m_deallocate(ptr==0) { if(NumTraits::RequireInitialization && object.data()) - Eigen::internal::construct_elements_of_array(object.data(), object.size()); + Eigen::internal::default_construct_elements_of_array(object.data(), object.size()); object = xpr; } -- GitLab From 28cd28072657e8367c0c611c8f155b17947bfbf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 4 Oct 2022 16:05:49 +0000 Subject: [PATCH 200/266] Fix 4x4 inverse when compiling with -Ofast. 
(cherry picked from commit 7d6a9925cc38842359750f3e06263e20b7635436) --- Eigen/src/LU/arch/InverseSize4.h | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index a232ffc0a..178bc3895 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -35,6 +35,13 @@ #ifndef EIGEN_INVERSE_SIZE_4_H #define EIGEN_INVERSE_SIZE_4_H +#ifdef EIGEN_COMP_GNUC +// These routines requires bit manipulation of the sign, which is not compatible +// with fastmath. +#pragma GCC push_options +#pragma GCC optimize ("no-fast-math") +#endif + namespace Eigen { namespace internal @@ -143,8 +150,8 @@ struct compute_inverse_size4(0x80000000u), numext::bit_cast(0x80000000u), 0.0f}; - const Packet4f p4f_sign_PNNP = ploadu(sign_mask); + EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f}; + const Packet4f p4f_sign_PNNP = pload(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); iB = pmul(iB, rd); @@ -326,10 +333,10 @@ struct compute_inverse_size4(0x8000000000000000ull)}; - const double sign_mask2[2] = {numext::bit_cast(0x8000000000000000ull), 0.0}; - const Packet2d sign_PN = ploadu(sign_mask1); - const Packet2d sign_NP = ploadu(sign_mask2); + EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0}; + EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0}; + const Packet2d sign_PN = pload(sign_mask1); + const Packet2d sign_NP = pload(sign_mask2); d1 = pxor(rd, sign_PN); d2 = pxor(rd, sign_NP); @@ -348,4 +355,9 @@ struct compute_inverse_size4 Date: Mon, 10 Oct 2022 20:38:53 +0000 Subject: [PATCH 201/266] Guard GCC-specific pragmas with "#ifdef EIGEN_COMP_GNUC" (cherry picked from commit 5ceed0d57f14b0d9d62b8732f7f686b3aae56738) --- Eigen/src/LU/arch/InverseSize4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 178bc3895..527019f6d 100644 
--- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -35,7 +35,7 @@ #ifndef EIGEN_INVERSE_SIZE_4_H #define EIGEN_INVERSE_SIZE_4_H -#ifdef EIGEN_COMP_GNUC +#if !EIGEN_COMP_LLVM // These routines requires bit manipulation of the sign, which is not compatible // with fastmath. #pragma GCC push_options @@ -356,7 +356,7 @@ struct compute_inverse_size4 Date: Tue, 11 Oct 2022 17:37:04 +0000 Subject: [PATCH 202/266] Eigen/Sparse: fix warnings -Wunused-but-set-variable (cherry picked from commit 7846c7387c1cb09cef7329630012ea1aefe64cf9) --- Eigen/src/SparseCore/TriangularSolver.h | 4 ++-- Eigen/src/SparseLU/SparseLU_heap_relax_snode.h | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/Eigen/src/SparseCore/TriangularSolver.h b/Eigen/src/SparseCore/TriangularSolver.h index f9c56ba79..7cb2c2665 100644 --- a/Eigen/src/SparseCore/TriangularSolver.h +++ b/Eigen/src/SparseCore/TriangularSolver.h @@ -270,11 +270,11 @@ struct sparse_solve_triangular_sparse_selector } - Index count = 0; +// Index count = 0; // FIXME compute a reference value to filter zeros for (typename AmbiVector::Iterator it(tempVector/*,1e-12*/); it; ++it) { - ++ count; +// ++ count; // std::cerr << "fill " << it.index() << ", " << col << "\n"; // std::cout << it.value() << " "; // FIXME use insertBack diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h index 6f75d500e..7aecbcad8 100644 --- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h @@ -75,8 +75,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe // Identify the relaxed supernodes by postorder traversal of the etree Index snode_start; // beginning of a snode StorageIndex k; - Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree - Index nsuper_et = 0; // Number of relaxed snodes in the original etree StorageIndex l; for (j = 0; j < n; ) { @@ -88,7 +86,6 @@ void 
SparseLUImpl::heap_relax_snode (const Index n, IndexVe parent = et(j); } // Found a supernode in postordered etree, j is the last column - ++nsuper_et_post; k = StorageIndex(n); for (Index i = snode_start; i <= j; ++i) k = (std::min)(k, inv_post(i)); @@ -97,7 +94,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe { // This is also a supernode in the original etree relax_end(k) = l; // Record last column - ++nsuper_et; } else { @@ -107,7 +103,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe if (descendants(i) == 0) { relax_end(l) = l; - ++nsuper_et; } } } -- GitLab From 15e23ab8497dc3b660bff62b21d634b13339c43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 19 Oct 2022 18:15:29 +0000 Subject: [PATCH 203/266] Explicitly state that indices must be sorted. (cherry picked from commit bf48d463384e2fde45d0b0a008bd95351e475766) --- Eigen/src/SparseCore/SparseMap.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index f99be3379..6247d79bd 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -237,6 +237,7 @@ class Map /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * The inner indices must be sorted appropriately. * * This constructor is available only if \c SparseMatrixType is non-const. * -- GitLab From 4786edba2671a5d7d0b4d778951e0e571619bc18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 26 Oct 2022 22:50:57 +0000 Subject: [PATCH 204/266] Fix pragma check for disabling fastmath. 
(cherry picked from commit c27d1abe460c32a432e1f019be17f2c0f876ccac) --- Eigen/src/LU/arch/InverseSize4.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 527019f6d..22ae38ac6 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -35,7 +35,7 @@ #ifndef EIGEN_INVERSE_SIZE_4_H #define EIGEN_INVERSE_SIZE_4_H -#if !EIGEN_COMP_LLVM +#if EIGEN_COMP_GNUC_STRICT // These routines requires bit manipulation of the sign, which is not compatible // with fastmath. #pragma GCC push_options @@ -356,7 +356,7 @@ struct compute_inverse_size4 Date: Tue, 8 Nov 2022 23:49:56 +0000 Subject: [PATCH 205/266] Fix typo in CholmodSupport (cherry picked from commit 7dc6db75d4e5cd89a4b708e4a022c66a27cda1bd) --- Eigen/CholmodSupport | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport index bed8924d3..1037bd55d 100644 --- a/Eigen/CholmodSupport +++ b/Eigen/CholmodSupport @@ -22,7 +22,7 @@ extern "C" { * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. - * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). + * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). 
* * For the sake of completeness, this module also propose the two following classes: * - class CholmodSimplicialLLT -- GitLab From 55472050924b807e09ccf00ac62ece0679bf61c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 15 Nov 2022 18:07:23 +0000 Subject: [PATCH 206/266] Correct pnegate for floating-point zero. (cherry picked from commit 8588d8c74b42eedde578af01605ecc90189bc329) --- Eigen/src/Core/arch/AVX/PacketMath.h | 6 ++++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 6 ++++-- Eigen/src/Core/util/Meta.h | 12 ------------ test/packetmath.cpp | 16 +++++++++++++--- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 7fc32fd71..2906217ba 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -285,11 +285,13 @@ template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { - return _mm256_sub_ps(_mm256_set1_ps(0.0),a); + const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); } template<> EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) { - return _mm256_sub_pd(_mm256_set1_pd(0.0),a); + const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL)); + return _mm256_xor_pd(a, mask); } template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index bf55319fc..3110b2df9 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -296,11 +296,13 @@ EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - return _mm512_sub_ps(_mm512_set1_ps(0.0), a); + const __m512i mask = _mm512_set1_epi32(0x80000000); + return 
_mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask)); } template <> EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - return _mm512_sub_pd(_mm512_set1_pd(0.0), a); + const __m512i mask = _mm512_set1_epi64(0x8000000000000000ULL); + return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask)); } template <> diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3a0e5677e..3a9479101 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -189,21 +189,9 @@ template<> struct make_unsigned { typedef unsigned int type; } template<> struct make_unsigned { typedef unsigned int type; }; template<> struct make_unsigned { typedef unsigned long type; }; template<> struct make_unsigned { typedef unsigned long type; }; -#if EIGEN_COMP_MSVC -template<> struct make_unsigned { typedef unsigned __int64 type; }; -template<> struct make_unsigned { typedef unsigned __int64 type; }; -#endif - -// Some platforms define int64_t as `long long` even for C++03, where -// `long long` is not guaranteed by the standard. In this case we are missing -// the definition for make_unsigned. If we just define it, we run into issues -// where `long long` doesn't exist in some compilers for C++03. We therefore add -// the specialization for these platforms only. 
-#if EIGEN_OS_MAC || EIGEN_COMP_MINGW template<> struct make_unsigned { typedef unsigned long long type; }; template<> struct make_unsigned { typedef unsigned long long type; }; #endif -#endif template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 23aa33fc2..518b801b9 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -637,9 +637,19 @@ void packetmath_real() { const int PacketSize = internal::unpacket_traits::size; const int size = PacketSize * 4; - EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4]; - EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4]; - EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4]; + EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4] = {}; + EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4] = {}; + EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4] = {}; + + // Negate with -0. + if (PacketTraits::HasNegate) { + test::packet_helper h; + data1[0] = Scalar(-0); + h.store(data2, internal::pnegate(h.load(data1))); + typedef typename internal::make_unsigned::type>::type Bits; + Bits bits = numext::bit_cast(data2[0]); + VERIFY_IS_EQUAL(bits, static_cast(Bits(1)<<(sizeof(Scalar)*CHAR_BIT - 1))); + } for (int i = 0; i < size; ++i) { data1[i] = Scalar(internal::random(0, 1) * std::pow(10., internal::random(-6, 6))); -- GitLab From 26adb0e5aff4de53f048a63b382d5e1a1ad2e96b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 29 Nov 2022 19:37:03 +0000 Subject: [PATCH 207/266] Fix sparseLU solver when destination has a non-unit stride. 
(cherry picked from commit ab2b26fbc27cd03c1d75ea8c2cce22fdd2bcc45b) --- Eigen/src/SparseLU/SparseLU.h | 10 ++++------ Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h | 7 +++---- test/sparse_solver.h | 7 +++++++ 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index f9143bc02..761b95c98 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -35,8 +35,8 @@ public: MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - SparseLUTransposeView() : m_sparseLU(NULL) {} - SparseLUTransposeView(const SparseLUTransposeView& view) { + SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() { this->m_sparseLU = view.m_sparseLU; } void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} @@ -833,7 +833,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator template void solveInPlace(MatrixBase &X) const { Index nrhs = X.cols(); - Index n = X.rows(); // Backward solve with U for (Index k = m_mapL.nsuper(); k >= 0; k--) { @@ -853,7 +852,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { // FIXME: the following lines should use Block expressions and not Map! 
Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); U = A.template triangularView().solve(U); } @@ -876,7 +875,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { using numext::conj; Index nrhs = X.cols(); - Index n = X.rows(); // Forward solve with U for (Index k = 0; k <= m_mapL.nsuper(); k++) { @@ -907,7 +905,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator else { Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = A.adjoint().template triangularView().solve(U); else diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 0be293d17..fd5e9fa51 100644 --- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -274,9 +274,8 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co // Triangular solve Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); - U = A.template triangularView().solve(U); - + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); + U = A.template triangularView().solve(U); // Matrix-vector product new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); work.topRows(nrow).noalias() = A * U; @@ -349,7 +348,7 @@ void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, 
nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = U - A.adjoint() * work.topRows(nrow); else diff --git a/test/sparse_solver.h b/test/sparse_solver.h index 58927944b..6f95e2fa7 100644 --- a/test/sparse_solver.h +++ b/test/sparse_solver.h @@ -99,6 +99,13 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A, VERIFY(solver.info() == Success && "solving failed when using Map"); VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!"); VERIFY(xm.isApprox(refX,test_precision())); + + // Test with a Map and non-unit stride. + Eigen::Matrix out(2*xm.rows(), 2*xm.cols()); + out.setZero(); + Eigen::Map > outm(out.data(), xm.rows(), xm.cols(), Stride(2 * xm.rows(), 2)); + outm = solver.solve(bm); + VERIFY(outm.isApprox(refX,test_precision())); } // if not too large, do some extra check: -- GitLab From 3eb0c8b69e3a3da47d92bf2c34f3903eef67a139 Mon Sep 17 00:00:00 2001 From: Alexandre Hoffmann Date: Tue, 29 Nov 2022 19:37:46 +0000 Subject: [PATCH 208/266] Changing BiCGSTAB parameters initialization so that it works with custom types (cherry picked from commit 23524ab6fcbf557b4ae1885d25d867727330c0c2) --- Eigen/src/IterativeLinearSolvers/BiCGSTAB.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 153acef65..1c9ade562 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -49,9 +49,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, x.setZero(); return true; } - Scalar rho = 1; - Scalar alpha = 1; - Scalar w = 1; + Scalar rho (1); + Scalar alpha (1); + Scalar w (1); VectorType v = VectorType::Zero(n), p = VectorType::Zero(n); VectorType y(n), z(n); -- GitLab From f5593b4baa0274d70d086562941fa46876938dd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: 
Tue, 29 Nov 2022 19:39:29 +0000 Subject: [PATCH 209/266] Fix reshape strides when input has non-zero inner stride. (cherry picked from commit 2260e11eb019161cc861ef2b832ce3b8a92efecd) --- Eigen/src/Core/Reshaped.h | 2 +- test/reshape.cpp | 19 +++++++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h index 52de73b6f..882314cfe 100644 --- a/Eigen/src/Core/Reshaped.h +++ b/Eigen/src/Core/Reshaped.h @@ -250,7 +250,7 @@ class ReshapedImpl_dense EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index outerStride() const { - return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows(); + return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride(); } protected: diff --git a/test/reshape.cpp b/test/reshape.cpp index 7b16742a2..1f966ebae 100644 --- a/test/reshape.cpp +++ b/test/reshape.cpp @@ -193,6 +193,24 @@ void reshape4x4(MatType m) } } +template +void reshape_block(const BlockType& M) { + typename BlockType::PlainObject dense = M.eval(); + Index rows = M.size() / 2; + Index cols = M.size() / rows; + VERIFY_IS_EQUAL(dense.reshaped(rows, cols), M.reshaped(rows, cols)); + + for (Index i=0; i RowMatrixXi; @@ -213,4 +231,5 @@ EIGEN_DECLARE_TEST(reshape) CALL_SUBTEST(reshape4x4(rmx)); CALL_SUBTEST(reshape4x4(rm4)); + CALL_SUBTEST(reshape_block(rm4.col(1))); } -- GitLab From b26ada1e032c34da899b751bf1b167891c521185 Mon Sep 17 00:00:00 2001 From: Martin Burchell Date: Tue, 10 Jan 2023 21:15:28 +0000 Subject: [PATCH 210/266] Fix error: unused parameter 'tmp' [-Werror,-Wunused-parameter] on clang/32-bit arm (cherry picked from commit c54785b071e6297c062883cf43f323525ff0e6fb) --- Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index 3481f337e..0963b0f1f 100644 --- 
a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -24,7 +24,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, - Packet4f& c, Packet4f& tmp, + Packet4f& c, Packet4f&, const LaneIdType&) const { acc(a, b, c); } -- GitLab From 90dce8dfa335c4cb7ee63ae3795c8fd2b390a935 Mon Sep 17 00:00:00 2001 From: Jeremy Nimmer Date: Wed, 1 Feb 2023 00:40:45 +0000 Subject: [PATCH 211/266] Fix undefined behavior in Block access (cherry picked from commit a1cdcdb038cda474aefb900171222254599e9dd8) --- Eigen/src/Core/Block.h | 23 +++++++++++++++++++---- Eigen/src/Core/StlIterators.h | 1 + 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index d0b95d50b..5932a9093 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -334,6 +334,17 @@ class BlockImpl_dense enum { XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0 }; + + /** \internal Returns base+offset (unless base is null, in which case returns null). + * Adding an offset to nullptr is undefined behavior, so we must avoid it. + */ + template + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE + static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) + { + return base != NULL ? base+offset : NULL; + } + public: typedef MapBase Base; @@ -344,8 +355,9 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) - || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), + : Base(add_to_nullable_pointer(xpr.data(), + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? 
xpr.innerStride() : xpr.outerStride())), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 1 : xpr.cols()), m_xpr(xpr), @@ -359,7 +371,8 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + : Base(add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); @@ -371,7 +384,9 @@ class BlockImpl_dense BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), + : Base(add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + blockRows, blockCols), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h index 09041db1d..5db3f605b 100644 --- a/Eigen/src/Core/StlIterators.h +++ b/Eigen/src/Core/StlIterators.h @@ -196,6 +196,7 @@ public: pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) { + eigen_assert(xpr.data() != NULL || index == 0 || m_incr.value() == 0); m_ptr = xpr.data() + index * m_incr.value(); } -- GitLab From 879854382cffc2a46a89570c13fcdbc4d0d3663d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 8 Feb 2023 21:46:37 +0000 Subject: [PATCH 212/266] Fix MSVC arm build. 
(cherry picked from commit 0a5392d6061134a4a32d0025fa154f830b83d606) --- Eigen/src/Core/arch/NEON/Complex.h | 4 +- Eigen/src/Core/arch/NEON/PacketMath.h | 90 ++++--- Eigen/src/Core/arch/NEON/TypeCasting.h | 339 +++++++++++++------------ 3 files changed, 234 insertions(+), 199 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index f40af7f87..a58f13ca8 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -129,12 +129,12 @@ template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Pa template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) { - const Packet2ui b = vreinterpret_u32_f32(a.v); + const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v)); return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - const Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v)); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6996cc8d3..6c2dbe458 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -57,6 +57,16 @@ typedef eigen_packet_wrapper Packet4ui; typedef eigen_packet_wrapper Packet2l; typedef eigen_packet_wrapper Packet2ul; +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return vld1q_f32(from); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + float from[2] = {a, b}; + return vld1_f32(from); +} + #else typedef float32x2_t Packet2f; @@ -78,11 +88,14 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return {a, b, c, d}; } +EIGEN_ALWAYS_INLINE Packet2f
make_packet2f(float a, float b) { return {a, b}; } + #endif // EIGEN_COMP_MSVC_STRICT EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))); return res; } @@ -95,7 +108,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -104,7 +117,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -146,7 +159,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC // __builtin_prefetch tends to do nothing on ARM64 compilers because the // prefetch instructions there are too detailed for __builtin_prefetch to map // meaningfully to them. 
@@ -862,12 +875,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub(const Packet2ul& a, con template<> EIGEN_STRONG_INLINE Packet2f pxor(const Packet2f& a, const Packet2f& b); template<> EIGEN_STRONG_INLINE Packet2f paddsub(const Packet2f& a, const Packet2f & b) { - Packet2f mask = {numext::bit_cast(0x80000000u), 0.0f}; + Packet2f mask = make_packet2f(numext::bit_cast(0x80000000u), 0.0f); return padd(a, pxor(mask, b)); } template<> EIGEN_STRONG_INLINE Packet4f pxor(const Packet4f& a, const Packet4f& b); template<> EIGEN_STRONG_INLINE Packet4f paddsub(const Packet4f& a, const Packet4f& b) { - Packet4f mask = {numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f}; + Packet4f mask = make_packet4f(numext::bit_cast(0x80000000u), 0.0f, numext::bit_cast(0x80000000u), 0.0f); return padd(a, pxor(mask, b)); } @@ -2499,7 +2512,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(co template<> EIGEN_STRONG_INLINE float predux_mul(const Packet2f& a) { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); } template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) -{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } +{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); } template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet4c& a) { int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a)); @@ -2513,7 +2526,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet8c& a) return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4); } template<> EIGEN_STRONG_INLINE int8_t predux_mul(const Packet16c& a) -{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } +{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); } template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet4uc& a) { uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a)); @@ -2527,7 +2540,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet8uc& a) return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4); 
} template<> EIGEN_STRONG_INLINE uint8_t predux_mul(const Packet16uc& a) -{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } +{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); } template<> EIGEN_STRONG_INLINE int16_t predux_mul(const Packet4s& a) { const int16x4_t prod = vmul_s16(a, vrev32_s16(a)); @@ -2563,11 +2576,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul(const Packet8us& a template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet2i& a) { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); } template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) -{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } +{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); } template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet2ui& a) { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); } template<> EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) -{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } +{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); } template<> EIGEN_STRONG_INLINE int64_t predux_mul(const Packet2l& a) { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); } template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) @@ -3388,7 +3401,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) { // See the scalar implemention in BFloat16.h for a comprehensible explanation // of this fast rounding algorithm - Packet4ui input = reinterpret_cast(p); + Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p)); // lsb = (input >> 16) & 1 Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1)); @@ -3413,7 +3426,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) { - return reinterpret_cast(vshlq_n_u32(vmovl_u16(p), 16)); + return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16))); } EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const 
Packet4f& p) { @@ -3421,21 +3434,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { } template<> EIGEN_STRONG_INLINE Packet4bf pset1(const bfloat16& from) { - return pset1(from.value); + return Packet4bf(pset1(from.value)); } template<> EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { - return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(from))); + return bfloat16_impl::raw_uint16_to_bfloat16(static_cast(pfirst(Packet4us(from)))); } template<> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { - return pload(reinterpret_cast(from)); + return Packet4bf(pload(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { - return ploadu(reinterpret_cast(from)); + return Packet4bf(ploadu(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) @@ -3450,7 +3463,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) { - return ploaddup(reinterpret_cast(from)); + return Packet4bf(ploaddup(reinterpret_cast(from))); } template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { @@ -3497,25 +3510,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) } template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { - return por(a, b); + return Packet4bf(por(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { - return pxor(a, b); + return Packet4bf(pxor(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { - return pand(a, b); + return Packet4bf(pand(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { - return pandnot(a, b); + return Packet4bf(pandnot(Packet4us(a), Packet4us(b))); } template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) { - return pselect(mask, a, b); + return Packet4bf(pselect(Packet4us(mask), Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) @@ -3554,13 +3567,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, con template<> EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) { - return pgather(reinterpret_cast(from), stride); + return Packet4bf(pgather(reinterpret_cast(from), stride)); } template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) { - pscatter(reinterpret_cast(to), from, stride); + pscatter(reinterpret_cast(to), Packet4us(from), stride); } template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) @@ -3585,7 +3598,7 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet4bf& a template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) { - return preverse(a); + return Packet4bf(preverse(Packet4us(a))); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) @@ -3620,7 +3633,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) { - return pxor(a, pset1(static_cast(0x8000))); + return Packet4bf(pxor(Packet4us(a), pset1(static_cast(0x8000)))); } //---------- double ---------- @@ -3638,17 +3651,34 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a) #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG +#if EIGEN_COMP_GNUC // Bug 907: workaround missing declarations of the following two functions in the ADK // Defining these functions as templates ensures that if these intrinsics are // already defined in arm_neon.h, then our workaround doesn't cause a conflict // and has lower priority in overload resolution. 
+// This doesn't work with MSVC though, since the function names are macros. template uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; } template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; } +#endif +#if EIGEN_COMP_MSVC_STRICT +typedef eigen_packet_wrapper Packet2d; +typedef eigen_packet_wrapper Packet1d; + +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { + double from[2] = {a, b}; + return vld1q_f64(from); +} + +#else typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return {a, b}; } +#endif + + // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) // Currently used in LU/arch/InverseSize4.h to enable a shared implementation // for fast inversion of matrices of size 4. @@ -3656,7 +3686,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int m { const double* a = reinterpret_cast(&m); const double* b = reinterpret_cast(&n); - Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))}; + Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1))); return res; } @@ -3747,7 +3777,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d pxor(const Packet2d& , const Packet2d& ); template<> EIGEN_STRONG_INLINE Packet2d paddsub(const Packet2d& a, const Packet2d& b){ - const Packet2d mask = {numext::bit_cast(0x8000000000000000ull),0.0}; + const Packet2d mask = make_packet2d(numext::bit_cast(0x8000000000000000ull), 0.0); return padd(a, pxor(mask, b)); } @@ -3862,7 +3892,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) { return (vget_low_f64(a) * vget_high_f64(a))[0]; } #else template<> EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) -{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); } +{ return vget_lane_f64(vmul_f64(vget_low_f64(a), 
vget_high_f64(a)), 0); } #endif // min diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 54f97336e..1bc51b0b1 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -15,6 +15,113 @@ namespace Eigen { namespace internal { +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return Packet2f(vreinterpret_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return Packet2f(vreinterpret_f32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return Packet4f(vreinterpretq_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return Packet4f(vreinterpretq_f32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return Packet8c(vreinterpret_s8_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return Packet16c(vreinterpretq_s8_u8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return Packet8uc(vreinterpret_u8_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return Packet16uc(vreinterpretq_u8_s8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return Packet4s(vreinterpret_s16_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return Packet8s(vreinterpretq_s16_u16(a)); +} + +template <> +EIGEN_STRONG_INLINE
Packet4us preinterpret(const Packet4s& a) { + return Packet4us(vreinterpret_u16_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return Packet8us(vreinterpretq_u16_s16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return Packet2i(vreinterpret_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return Packet2i(vreinterpret_s32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return Packet4i(vreinterpretq_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return Packet4i(vreinterpretq_s32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return Packet2ui(vreinterpret_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return Packet2ui(vreinterpret_u32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return Packet4ui(vreinterpretq_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return Packet4ui(vreinterpretq_u32_s32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return Packet2l(vreinterpretq_s64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return Packet2ul(vreinterpretq_u64_s64(a)); +} + //============================================================================== // pcast, SrcType = float //============================================================================== @@ -188,7 +295,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -212,11 +319,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const 
Packet16c& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -240,11 +347,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { - return vreinterpretq_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { - return vreinterpret_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -270,11 +377,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { - return vreinterpretq_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { - return vreinterpret_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { @@ -315,7 +422,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -339,11 +446,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -367,11 +474,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { - return vreinterpretq_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { - return vreinterpret_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -397,11 +504,11 @@ struct type_casting_traits { }; template <> 
EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { - return vreinterpret_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { @@ -442,7 +549,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -466,11 +573,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -492,11 +599,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { - return vreinterpretq_u16_s16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { - return vreinterpret_u16_s16(a); + return preinterpret(a); } template <> @@ -559,7 +666,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -583,11 +690,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -609,11 +716,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { - return vreinterpretq_s16_u16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE 
Packet4s pcast(const Packet4us& a) { - return vreinterpret_s16_u16(a); + return preinterpret(a); } template <> @@ -635,11 +742,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { - return vreinterpretq_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { - return vreinterpret_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } //============================================================================== @@ -674,7 +781,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -696,11 +803,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { - return vreinterpretq_u32_s32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { - return vreinterpret_u32_s32(a); + return preinterpret(a); } template <> @@ -799,7 +906,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -821,11 +928,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { - return vreinterpret_s32_u32(a); + return preinterpret(a); } template <> @@ -847,11 +954,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { - return vreinterpretq_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { - return 
vreinterpret_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -880,12 +987,12 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, const Packet4ui& d) { - return vreinterpretq_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, const Packet2ui& d) { - return vreinterpret_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } //============================================================================== @@ -915,7 +1022,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(a); } template <> @@ -1013,7 +1120,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); + return preinterpret(a); } template <> @@ -1031,7 +1138,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { - return vreinterpretq_s32_u32(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -1053,7 +1160,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d) { - return vreinterpretq_s16_u16(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> @@ -1077,122 +1184,40 @@ template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, const Packet2ul& g, const Packet2ul& h) { - return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); + return preinterpret(pcast(a, b, c, d, e, f, g, h)); } +#if EIGEN_ARCH_ARM64 + 
//============================================================================== -// preinterpret +// pcast/preinterpret, Double //============================================================================== -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { - return vreinterpret_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { - return vreinterpret_f32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return vreinterpretq_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { - return vreinterpretq_f32_u32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { - return vreinterpret_s8_u8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { - return vreinterpret_u8_s8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { - return vreinterpretq_u8_s8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { - return vreinterpret_s16_u16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { - return vreinterpretq_s16_u16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { - return vreinterpret_u16_s16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { - return vreinterpretq_u16_s16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { - return vreinterpret_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& 
a) { - return vreinterpret_s32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return vreinterpretq_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); -} template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { - return vreinterpret_u32_f32(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return Packet2d(vreinterpretq_f64_s64(a)); } template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { - return vreinterpret_u32_s32(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return Packet2d(vreinterpretq_f64_u64(a)); } template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { - return vreinterpretq_u32_f32(a); +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return Packet2l(vreinterpretq_s64_f64(a)); } template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { - return vreinterpretq_u32_s32(a); +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return Packet2ul(vreinterpretq_u64_f64(a)); } - template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return Packet2d(vreinterpretq_f64_s32(a)); } template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { - return vreinterpretq_u64_s64(a); +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return Packet4i(vreinterpretq_s32_f64(a)); } -#if EIGEN_ARCH_ARM64 - -//============================================================================== -// pcast/preinterpret, Double -//============================================================================== - template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; @@ -1314,7 +1339,9 @@ struct type_casting_traits { 
template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s8(a))); + // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability. + Packet2f tmp = pcast(vget_low_s8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1324,7 +1351,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_u8(a))); + Packet2f tmp = pcast(vget_low_u8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1334,7 +1362,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1344,7 +1373,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { // Discard all but first two values. 
- return vcvt_f64_f32(pcast(vget_low_u16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { - return vreinterpretq_f64_s64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { - return vreinterpretq_f64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { - return vreinterpretq_s64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { - return vreinterpretq_u64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { - return vreinterpretq_f64_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { - return vreinterpretq_s32_f64(a); -} - #endif // EIGEN_ARCH_ARM64 } // end namespace internal -- GitLab From a659b5dbb28bf0d210133f7abd9c74199efafcd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 14 Feb 2023 16:52:07 +0000 Subject: [PATCH 213/266] Fix NEON make_packet2f. 
(cherry picked from commit 2dfbf1b251e7a32c140f36fc865b154b8a725bdd) --- Eigen/src/Core/arch/NEON/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 6c2dbe458..dd1234b9f 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -89,7 +89,7 @@ typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return {a, b, c, d}; } -EIGEN_ALWAYS_INLINE Packet4f make_packet2f(float a, float b) { return {a, b}; } +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return {a, b}; } #endif // EIGEN_COMP_MSVC_STRICT -- GitLab From 2dfdaa2abfe5d83b9d94a188a7c227d29469644c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 14 Feb 2023 21:45:25 +0000 Subject: [PATCH 214/266] More NEON packetmath fixes. (cherry picked from commit 384269937f707669fb1ab65bee7e9bfca2c2dfa1) --- Eigen/src/Core/arch/NEON/PacketMath.h | 35 +++++++++++--------------- Eigen/src/Core/arch/NEON/TypeCasting.h | 4 +-- 2 files changed, 16 insertions(+), 23 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index dd1234b9f..e57d9c91c 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -57,16 +57,6 @@ typedef eigen_packet_wrapper Packet4ui; typedef eigen_packet_wrapper Packet2l; typedef eigen_packet_wrapper Packet2ul; -EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { - float from[4] = {a, b, c, d}; - return vld1q_f32(from); -} - -EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { - float from[2] = {a, b}; - return vld1_f32(from); -} - #else typedef float32x2_t Packet2f; @@ -88,11 +78,18 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; -EIGEN_ALWAYS_INLINE Packet4f 
make_packet4f(float a, float b, float c, float d) { return {a, b, c, d}; } -EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return {a, b}; } - #endif // EIGEN_COMP_MSVC_STRICT +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return vld1q_f32(from); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + float from[2] = {a, b}; + return vld1_f32(from); +} + EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))); @@ -3665,19 +3662,15 @@ template float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2 #if EIGEN_COMP_MSVC_STRICT typedef eigen_packet_wrapper Packet2d; typedef eigen_packet_wrapper Packet1d; - -EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { - double from[2] = {a, b}; - return vld1q_f64(from); -} - #else typedef float64x2_t Packet2d; typedef float64x1_t Packet1d; - -EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return {a, b}; } #endif +EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { + double from[2] = {a, b}; + return vld1q_f64(from); +} // fuctionally equivalent to _mm_shuffle_pd in SSE (i.e. 
shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask)) // Currently used in LU/arch/InverseSize4.h to enable a shared implementation diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 1bc51b0b1..c546466a1 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -41,7 +41,7 @@ EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& } template <> EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { - return Packet8c(preinterpret(a)); + return Packet8c(vreinterpret_s8_u8(a)); } template <> EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { @@ -1373,7 +1373,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { // Discard all but first two values. - Packet2f tmp = pcast(vget_low_s16(a)); + Packet2f tmp = pcast(vget_low_u16(a)); return vcvt_f64_f32(tmp); } -- GitLab From dae8c6d7ad58f3816fb2d1da2873d1c84ccdb583 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 16 Feb 2023 19:47:00 +0000 Subject: [PATCH 215/266] Guard complex sqrt on old MSVC compilers. 
(cherry picked from commit a16fb889dd5890b2a0788af10568f19155e6b262) --- Eigen/src/Core/arch/AVX512/Complex.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index bdedb7b6c..0167d050e 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -37,7 +37,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -194,7 +194,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -406,6 +406,8 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] } +#if EIGEN_HAS_AVX512_MATH + template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { return psqrt_complex(a); } @@ -414,6 +416,8 @@ template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { return psqrt_complex(a); } +#endif + } // end namespace internal } // end namespace Eigen -- GitLab From 8f1b6198c26d8c72bdb7762ac56611eb06354f30 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Thu, 16 Feb 2023 23:35:42 +0000 Subject: [PATCH 216/266] Fix epsilon and dummy_precision values in long double for double doubles. Prevented some algorithms from converging on PPC. 
(cherry picked from commit 54459214a1b9c67df04bc529474fca1ec9f4c84f) --- Eigen/src/Core/NumTraits.h | 14 ++++++++++++-- .../Eigen/src/MatrixFunctions/MatrixPower.h | 2 +- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 72eac5a93..8e81237ac 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -248,8 +248,18 @@ template<> struct NumTraits : GenericNumTraits template<> struct NumTraits : GenericNumTraits { - EIGEN_CONSTEXPR - static inline long double dummy_precision() { return 1e-15l; } + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline long double dummy_precision() { return static_cast(1e-15l); } + +#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106) + // PowerPC double double causes issues with some values + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR + static inline long double epsilon() + { + // 2^(-(__LDBL_MANT_DIG__)+1) + return static_cast(2.4651903288156618919116517665087e-32l); + } +#endif }; template struct NumTraits > diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index d7672d7c9..ce92f5bfd 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -281,7 +281,7 @@ inline int MatrixPowerAtomic::getPadeDegree(long double normIminusT) #endif int degree = 3; for (; degree <= maxPadeDegree; ++degree) - if (normIminusT <= maxNormForPade[degree - 3]) + if (normIminusT <= static_cast(maxNormForPade[degree - 3])) break; return degree; } -- GitLab From 2ce5dc428fa590ca2cfe06399ccd0076cecd695e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 24 Feb 2023 21:49:59 +0000 Subject: [PATCH 217/266] Guard use of long double on GPU device. 
(cherry picked from commit bc5cdc7a675621ce54c4a9277d041d9dd7aae757) --- Eigen/src/Core/MathFunctions.h | 15 ++++++++++++++- Eigen/src/Core/NumTraits.h | 3 +++ Eigen/src/Core/util/Meta.h | 3 +++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 61b78f4f2..bff595300 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -1032,11 +1032,15 @@ template EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) } //MSVC defines a _isnan builtin function, but for double only +#ifndef EIGEN_GPU_COMPILE_PHASE EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; } +#endif EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { return _isnan(x)!=0; } EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { return _isnan(x)!=0; } +#ifndef EIGEN_GPU_COMPILE_PHASE EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); } +#endif EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { return isinf_msvc_helper(x); } EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_msvc_helper(x); } @@ -1050,12 +1054,16 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_ms #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only"))) #endif +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); } +#endif template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); } template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); } template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); } template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); } +#ifndef EIGEN_GPU_COMPILE_PHASE template<> 
EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); } +#endif #undef EIGEN_TMP_NOOPT_ATTRIB @@ -1112,6 +1120,8 @@ EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) { return fmin(x, y); } + +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) @@ -1123,6 +1133,7 @@ EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) return fminl(x, y); #endif } +#endif template EIGEN_DEVICE_FUNC @@ -1142,6 +1153,7 @@ EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) { return fmax(x, y); } +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) @@ -1154,6 +1166,7 @@ EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) #endif } #endif +#endif #if defined(SYCL_DEVICE_ONLY) @@ -1310,8 +1323,8 @@ EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) return fabs(x - y); } -#if !defined(EIGEN_GPUCC) // HIP and CUDA do not support long double. +#ifndef EIGEN_GPU_COMPILE_PHASE template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) { diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h index 8e81237ac..e23265138 100644 --- a/Eigen/src/Core/NumTraits.h +++ b/Eigen/src/Core/NumTraits.h @@ -245,6 +245,8 @@ template<> struct NumTraits : GenericNumTraits static inline double dummy_precision() { return 1e-12; } }; +// GPU devices treat `long double` as `double`. 
+#ifndef EIGEN_GPU_COMPILE_PHASE template<> struct NumTraits : GenericNumTraits { @@ -261,6 +263,7 @@ template<> struct NumTraits } #endif }; +#endif template struct NumTraits > : GenericNumTraits > diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 3a9479101..a7966b867 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -133,7 +133,10 @@ template struct remove_all { typedef typename remove_all< template struct is_arithmetic { enum { value = false }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +// GPU devices treat `long double` as `double`. +#ifndef EIGEN_GPU_COMPILE_PHASE template<> struct is_arithmetic { enum { value = true }; }; +#endif template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; -- GitLab From 99473f255b939bf893d653eb9492037c53f7d65f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 10 Mar 2023 22:36:57 +0000 Subject: [PATCH 218/266] Fix failing MSVC tests due to compiler bugs. 
(cherry picked from commit 394aabb0a3976d95a5c6f286d49e43bb49558cc2) --- Eigen/src/Core/arch/AVX/PacketMath.h | 16 ++++++++++++++-- Eigen/src/Core/arch/AVX512/PacketMath.h | 11 +++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 2906217ba..5be14ba11 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -630,11 +630,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { +#ifdef EIGEN_VECTORIZE_AVX512 + __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF); + EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from)); +#else Packet8i mask = _mm256_set1_epi8(static_cast(umask)); - const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe); mask = por(mask, bit_mask); mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); - EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +#if EIGEN_COMP_MSVC + // MSVC sometimes seems to use a bogus mask with maskstore. 
+ const __m256i ifrom = _mm256_castps_si256(from); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast(to)); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast(to + 4)); +#else + EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from); +#endif +#endif } // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index 3110b2df9..4ab100cec 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -296,12 +296,19 @@ EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - const __m512i mask = _mm512_set1_epi32(0x80000000); + // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results. + // The intel docs give it a relatively high latency as well, so we're probably + // better off with using _mm512_set_epi32 directly anyways. 
+ const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000); return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask)); } template <> EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - const __m512i mask = _mm512_set1_epi64(0x8000000000000000ULL); + const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, + 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL); return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask)); } -- GitLab From 63e8b31c9424da1cdc0bdb3bde9c27041c8369f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Thu, 16 Mar 2023 22:47:38 +0000 Subject: [PATCH 219/266] Fix parsing of command-line arguments when already specified as a cmake list. (cherry picked from commit 555cec17edc2cae91b3310ef8915acbeb3951cab) --- CMakeLists.txt | 18 ++++++++++++++++++ cmake/EigenTesting.cmake | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 22456f713..faaaa265f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,20 @@ else() ei_add_cxx_compiler_flag("-std=c++03") endif() +function(ei_maybe_separate_arguments variable mode args) + # Use separate_arguments if the input is a single string containing a space. + # Otherwise, if it is already a list or doesn't have a space, just propagate + # the original value. This is to better support multi-argument lists. 
+ list(LENGTH args list_length) + if (${list_length} EQUAL 1) + string(FIND "${args}" " " has_space) + if (${has_space} GREATER -1) + separate_arguments(args ${mode} "${args}") + endif() + endif() + set(${variable} ${args} PARENT_SCOPE) +endfunction(ei_maybe_separate_arguments) + # Determine if we should build shared libraries on this platform. get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS) @@ -101,6 +115,10 @@ find_package(StandardMathLibrary) set(EIGEN_TEST_CUSTOM_LINKER_FLAGS "" CACHE STRING "Additional linker flags when linking unit tests.") set(EIGEN_TEST_CUSTOM_CXX_FLAGS "" CACHE STRING "Additional compiler flags when compiling unit tests.") +# Convert space-separated arguments into CMake lists for downstream consumption. +ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}") +ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "") if(NOT STANDARD_MATH_LIBRARY_FOUND) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index eb8457db6..bc47da86d 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -83,7 +83,7 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) - ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + ei_add_target_property(${targetname} COMPILE_FLAGS ${EIGEN_TEST_CUSTOM_CXX_FLAGS}) endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) -- GitLab From 34d0d83278eab61b63f5f4f55cbde6ec8a3b781e Mon Sep 17 00:00:00 2001 From: Jonas Schulze Date: Thu, 16 Mar 2023 23:11:43 +0000 Subject: [PATCH 220/266] Fix some typos (cherry picked from commit 81cb6a51d0358151a7174481e140074401ca4afa) --- Eigen/src/Core/util/Constants.h | 2 +- doc/CustomizingEigen_Plugins.dox | 2 +- doc/FunctionsTakingEigenTypes.dox | 2 +- doc/SparseQuickReference.dox | 2 +- doc/examples/matrixfree_cg.cpp 
| 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index 35dcaa7b3..0667b1c44 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -134,7 +134,7 @@ const unsigned int LinearAccessBit = 0x10; * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable. * This rules out read-only expressions. * - * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note + * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but not * the other: * \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit * \li Map-to-const expressions, for example Map, have DirectAccessBit but not LvalueBit diff --git a/doc/CustomizingEigen_Plugins.dox b/doc/CustomizingEigen_Plugins.dox index d88f2409b..9ab0200ff 100644 --- a/doc/CustomizingEigen_Plugins.dox +++ b/doc/CustomizingEigen_Plugins.dox @@ -59,7 +59,7 @@ operator+(const Scalar& scalar, const MatrixBase& mat) { return CwiseBinaryOp, const ConstantReturnType, Derived>(Constant(rows(),cols(),scalar), mat.derived()); } \endcode -Then one can the following declaration in the config.h or whatever prerequisites header file of his project: +Then one can add the following declaration in the config.h or whatever prerequisites header file of his project: \code #define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h" \endcode diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index 6b4e49214..3e745462c 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -126,7 +126,7 @@ and contrary to what one might think at first, this implementation is fine unles MatrixXf x,y,z; MatrixXf C = cov(x,y+z); \endcode -In this special case, 
the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression x+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C. +In this special case, the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression y+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C. \b Note: Functions taking \e const references to Matrix (or Array) can process expressions at the cost of temporaries. diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index 9779f3f9c..14a589131 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -153,7 +153,7 @@ It is easy to perform arithmetic operations on sparse matrices provided that the \code perm.indices(); // Reference to the vector of indices sm1.twistedBy(perm); // Permute rows and columns -sm2 = sm1 * perm; // Permute the columns +sm2 = sm1 * perm; // Permute the rows sm2 = perm * sm1; // Permute the columns \endcode diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp index 74699381c..413c9a3f8 100644 --- a/doc/examples/matrixfree_cg.cpp +++ b/doc/examples/matrixfree_cg.cpp @@ -9,7 +9,7 @@ using Eigen::SparseMatrix; namespace Eigen { namespace internal { - // MatrixReplacement looks-like a SparseMatrix, so let's inherits its traits: + // MatrixReplacement looks-like a SparseMatrix, so let's inherit its traits: template<> struct traits : public Eigen::internal::traits > {}; -- GitLab From 72b0759451b26af70bed8132731d887d9384f7f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 20 Mar 2023 16:59:38 +0000 Subject: [PATCH 221/266] Fix arm builds. 
(cherry picked from commit 2c8011c2dd72d6c2086b181aad8cbb6204fed5db) --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- Eigen/src/Core/util/Macros.h | 6 +++--- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e57d9c91c..5915f6af8 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -145,7 +145,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); } #define vec4f_duplane(a, p) \ - vdupq_lane_f32(vget_low_f32(a), p) + Packet4f(vdupq_lane_f32(vget_low_f32(a), p)) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -3696,7 +3696,7 @@ EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b) return shuffle(a, b, 3); } #define vec2d_duplane(a, p) \ - vdupq_laneq_f64(a, p) + Packet2d(vdupq_laneq_f64(a, p)) template<> struct packet_traits : default_packet_traits { diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index b436dfad3..90ea9b2e1 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -275,7 +275,7 @@ /// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE /// compliant Arm fp16 type -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16 #if defined(__ARM_FP16_FORMAT_IEEE) #define EIGEN_HAS_ARM64_FP16 1 @@ -287,7 +287,7 @@ /// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture /// supports Neon vector intrinsics for fp16. 
-#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 @@ -299,7 +299,7 @@ /// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture /// supports Neon scalar intrinsics for fp16. -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b90a1dcd6..a4922e913 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -263,8 +263,8 @@ namespace internal { template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx, numeric_list) { - return { idx[Is]... }; + array customIndices2Array(IndexType& idx, numeric_list) { + return { static_cast(idx[First]), static_cast(idx[Is])... }; } template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -- GitLab From d4c24eca9696ee45635a7e96dfeb9afb162a2d9e Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Wed, 5 Apr 2023 17:06:14 +0000 Subject: [PATCH 222/266] Don't crash on empty tensor contraction. 
(cherry picked from commit b0f877f8e01e90a5b0f3a79d46ea234899f8b499) --- unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h | 1 - 1 file changed, 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 8b35f7985..fa36da195 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -89,7 +89,6 @@ struct TensorContractionBlockMemAllocator { eigen_assert(rhs_block); BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); - eigen_assert(block_mem); *lhs_block = reinterpret_cast(block_mem); *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); return block_mem; -- GitLab From 6f9bffe8ddd85c97fb3b7d7179276d5f3d906d82 Mon Sep 17 00:00:00 2001 From: Rohit Goswami <308505-HaoZeke@users.noreply.gitlab.com> Date: Thu, 6 Apr 2023 19:20:41 +0000 Subject: [PATCH 223/266] DOC: Update documentation for 3.4.x (cherry picked from commit b0eded878d5d162d61583a286c0d8a45406ad1bc) --- doc/TutorialSlicingIndexing.dox | 4 ++-- doc/examples/matrixfree_cg.cpp | 2 +- doc/snippets/Tridiagonalization_decomposeInPlace.cpp | 3 ++- doc/snippets/compile_snippet.cpp.in | 1 + 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index e8c860a83..60f1edca3 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox @@ -14,8 +14,8 @@ In particular, it supports \b slicing that consists in taking a set of rows, col All the aforementioned operations are handled through the generic DenseBase::operator()(const RowIndices&, const ColIndices&) method. Each argument can be: - An integer indexing a single row or column, including symbolic indices. - - The symbol Eigen::all representing the whole set of respective rows or columns in increasing order. 
- - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::lastN functions. + - The symbol Eigen::placeholders::all representing the whole set of respective rows or columns in increasing order. + - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::placeholders::lastN functions. - Any 1D vector/array of integers including %Eigen's vector/array, expressions, std::vector, std::array, as well as plain C arrays: `int[N]`. More generally, it can accepts any object exposing the following two member functions: diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp index 413c9a3f8..cc0eead13 100644 --- a/doc/examples/matrixfree_cg.cpp +++ b/doc/examples/matrixfree_cg.cpp @@ -66,7 +66,7 @@ namespace internal { { // This method should implement "dst += alpha * lhs * rhs" inplace, // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it. - assert(alpha==Scalar(1) && "scaling is not implemented"); + eigen_assert(alpha==Scalar(1) && "scaling is not implemented"); EIGEN_ONLY_USED_FOR_DEBUG(alpha); // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs, diff --git a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp index 3cdce679b..9a66baa76 100644 --- a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp +++ b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp @@ -5,7 +5,8 @@ cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl; VectorXd diag(5); VectorXd subdiag(4); VectorXd hcoeffs(4); // Scratch space for householder reflector. 
-internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, true); +VectorXd workspace(5); +internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, workspace, true); cout << "The orthogonal matrix Q is:" << endl << A << endl; cout << "The diagonal of the tridiagonal matrix T is:" << endl << diag << endl; cout << "The subdiagonal of the tridiagonal matrix T is:" << endl << subdiag << endl; diff --git a/doc/snippets/compile_snippet.cpp.in b/doc/snippets/compile_snippet.cpp.in index c11457a3f..04f276d0b 100644 --- a/doc/snippets/compile_snippet.cpp.in +++ b/doc/snippets/compile_snippet.cpp.in @@ -2,6 +2,7 @@ static bool eigen_did_assert = false; #define eigen_assert(X) if(!eigen_did_assert && !(X)){ std::cout << "### Assertion raised in " << __FILE__ << ":" << __LINE__ << ":\n" #X << "\n### The following would happen without assertions:\n"; eigen_did_assert = true;} #include +#include #include #ifndef M_PI -- GitLab From f04d02dbf65a6030e83747313191449795dc400b Mon Sep 17 00:00:00 2001 From: Rob Conde Date: Wed, 12 Apr 2023 10:36:08 -0400 Subject: [PATCH 224/266] exclude `Eigen/Core` and `Eigen/src/Core` from being ignored due to `core` ignore rule (cherry picked from commit 990a282fc40e9fb62a7aea1ba67b5c00ed838732) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index f6ab76fda..19dfac9ef 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ lapack/reference .settings Makefile !ci/build.gitlab-ci.yml +!scripts/buildtests.in +!Eigen/Core +!Eigen/src/Core -- GitLab From f296720d7dcdd26a4279824bceb1947dede2e21d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 18 Apr 2023 13:27:47 -0700 Subject: [PATCH 225/266] Make sure we return +/-1 above the clamping point for Erf(). 
(cherry picked from commit b378014fef017a829fb42c7fad15f3764bfb8ef9) --- .../src/SpecialFunctions/SpecialFunctionsImpl.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index bd2ac8308..7634bf72f 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -301,12 +301,9 @@ struct digamma_impl { This implementation works on both scalars and Ts. */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) { - // Clamp the inputs to the range [-4, 4] since anything outside - // this range is +/-1.0f in single-precision. - const T plus_4 = pset1(4.f); - const T minus_4 = pset1(-4.f); - const T x = pmax(pmin(a_x, plus_4), minus_4); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) { + const float kErfInvOneMinusHalfULP = 3.832506856900711f; + const T clamp = pcmp_le(pset1(kErfInvOneMinusHalfULP), pabs(x)); // The monomial coefficients of the numerator polynomial (odd). const T alpha_1 = pset1(-1.60960333262415e-02f); const T alpha_3 = pset1(-2.95459980854025e-03f); @@ -342,7 +339,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) { q = pmadd(x2, q, beta_0); // Divide the numerator by the denominator. - return pdiv(p, q); + const T sign = pselect(pcmp_le(x, pset1(0.0f)), pset1(-1.0f), pset1(1.0f)); + return pselect(clamp, sign, pdiv(p, q)); } template -- GitLab From a4ecfd8ead9b1ffd62265e04e5ecd7b73f86949b Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 25 Apr 2023 15:24:49 +0000 Subject: [PATCH 226/266] Fix boolean bitwise and warning. 
(cherry picked from commit 70410310a40bf6f790e8f3c785e78aa2b8f4ca12) --- test/product_small.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/product_small.cpp b/test/product_small.cpp index 1d6df6e58..fec7f5658 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -70,7 +70,7 @@ void test_dynamic_bool() for(Index i=0;i Date: Tue, 2 May 2023 17:48:21 +0000 Subject: [PATCH 227/266] JacobiSVD: set m_nonzeroSingularValues to zero if not finite (cherry picked from commit fdc749de2ac1dd6ab25298dd60ab8d594992fd07) --- Eigen/src/SVD/JacobiSVD.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Eigen/src/SVD/JacobiSVD.h b/Eigen/src/SVD/JacobiSVD.h index 9d95acdf6..4b002ad4c 100644 --- a/Eigen/src/SVD/JacobiSVD.h +++ b/Eigen/src/SVD/JacobiSVD.h @@ -680,6 +680,7 @@ JacobiSVD::compute(const MatrixType& matrix, unsig if (!(numext::isfinite)(scale)) { m_isInitialized = true; m_info = InvalidInput; + m_nonzeroSingularValues = 0; return *this; } if(scale==RealScalar(0)) scale = RealScalar(1); -- GitLab From 26b8fabd80e882235682f58331ef232bf78b9f10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 5 May 2023 16:27:26 +0000 Subject: [PATCH 228/266] Return NaN in ndtri for values outside valid input range. 
(cherry picked from commit 1f79a6078fb77da47069c8aec23c4e309fb982e2) --- .../Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h | 10 +++++----- unsupported/test/special_functions.cpp | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index 7634bf72f..243ffdd5e 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -471,9 +471,9 @@ struct erfc_impl { * ERROR MESSAGES: * * message condition value returned - * ndtri domain x <= 0 -MAXNUM - * ndtri domain x >= 1 MAXNUM - * + * ndtri domain x == 0 -INF + * ndtri domain x == 1 INF + * ndtri domain x < 0, x > 1 NAN */ /* Cephes Math Library Release 2.2: June, 1992 @@ -635,8 +635,8 @@ T generic_ndtri(const T& a) { generic_ndtri_lt_exp_neg_two(b, should_flipsign)); return pselect( - pcmp_le(a, zero), neg_maxnum, - pselect(pcmp_le(one, a), maxnum, ndtri)); + pcmp_eq(a, zero), neg_maxnum, + pselect(pcmp_eq(one, a), maxnum, ndtri)); } template diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 756f031c2..44c77535e 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -171,9 +171,9 @@ template void array_special_functions() // Check the ndtri function against scipy.special.ndtri { - ArrayType x(7), res(7), ref(7); - x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01; - ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408; + ArrayType x(11), res(11), ref(11); + x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01, 0, 1, -0.01, 1.01; + ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408, -plusinf, plusinf, nan, nan; CALL_SUBTEST( verify_component_wise(ref, ref); ); 
CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); ); CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); ); -- GitLab From af3ca50f0b895c45e72130312cf9e78ba162454f Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 9 May 2023 18:53:56 +0000 Subject: [PATCH 229/266] Work around compiler bug in Umeyama.h. (cherry picked from commit 524c329ab23fb565eff8aa2bf36d134e97773a99) --- Eigen/src/Geometry/Umeyama.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 6b755008f..0c167c76b 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -137,7 +137,8 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo VectorType S = VectorType::Ones(m); if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) - S(m-1) = -1; + Index tmp = m - 1; + S(tmp) = -1; // Eq. (40) and (43) Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); -- GitLab From 9b3d104c0239d52477bfd25961dbd41007cc1155 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Tue, 9 May 2023 19:10:50 +0000 Subject: [PATCH 230/266] Add missing braces in Umeyama.h (cherry picked from commit 1321821e86ebbfdf8ccda7f89b59f19feb023dbc) --- Eigen/src/Geometry/Umeyama.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 0c167c76b..2a5c395b2 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -136,9 +136,10 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) + if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) { Index tmp = m - 1; S(tmp) = -1; + } // Eq. 
(40) and (43) Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); -- GitLab From 357bb110661a3d6edd394eba55b49bc1afbab46f Mon Sep 17 00:00:00 2001 From: Alejandro Acosta Date: Mon, 29 May 2023 17:23:58 +0100 Subject: [PATCH 231/266] Replace usage of CudaStreamDevice with GpuStreamDevice in tensor benchmarks GPU (cherry picked from commit 07e4604b1961a32bbe21841a1e97fc274b50c443) --- bench/tensors/tensor_benchmarks_fp16_gpu.cu | 6 +++--- bench/tensors/tensor_benchmarks_gpu.cu | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bench/tensors/tensor_benchmarks_fp16_gpu.cu b/bench/tensors/tensor_benchmarks_fp16_gpu.cu index 65784d0d6..d63ff8bb2 100644 --- a/bench/tensors/tensor_benchmarks_fp16_gpu.cu +++ b/bench/tensors/tensor_benchmarks_fp16_gpu.cu @@ -10,7 +10,7 @@ #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ @@ -40,7 +40,7 @@ BM_FuncGPU(fullReduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ @@ -59,7 +59,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index 76d68c5c1..c77810264 100644 --- 
a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -10,7 +10,7 @@ #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ @@ -40,7 +40,7 @@ BM_FuncGPU(fullReduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ @@ -59,7 +59,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ -- GitLab From ebfdd6bdeaafa4aea6136e759b7bece170055408 Mon Sep 17 00:00:00 2001 From: Alexander Shaposhnikov Date: Wed, 31 May 2023 15:15:06 +0000 Subject: [PATCH 232/266] Do not set EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC for cuda compilation (cherry picked from commit 316eab8deb574d150f9cfc7f8b170156dc0cdd9f) --- Eigen/src/Core/util/Macros.h | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 90ea9b2e1..6cbb8ec67 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -285,28 +285,6 @@ #endif #endif -/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture -/// supports Neon vector intrinsics for fp16. 
-#if EIGEN_ARCH_ARM_OR_ARM64 - #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 - #else - #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 - #endif - #endif -#endif - -/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture -/// supports Neon scalar intrinsics for fp16. -#if EIGEN_ARCH_ARM_OR_ARM64 - #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) - #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 - #endif - #endif -#endif - /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -565,6 +543,28 @@ // #endif +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM_OR_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. +#if EIGEN_ARCH_ARM_OR_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + #if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) // EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. // In most cases we want to check if both macros are defined which can be done using the define below. 
-- GitLab From e6e921f0e352bafc51d965ea0ac42a38ce7d372b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 26 Jun 2023 18:39:42 +0000 Subject: [PATCH 233/266] Disable FP16 arithmetic for arm32. (cherry picked from commit 7465b7651edfb58322557179658853243eb96372) --- Eigen/src/Core/arch/NEON/PacketMath.h | 4 ++-- Eigen/src/Core/util/Macros.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 5915f6af8..f6d6d635a 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -3941,6 +3941,8 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) { template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); } +#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG + // Do we have an fp16 types and supporting Neon intrinsics? #if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC typedef float16x4_t Packet4hf; @@ -4601,8 +4603,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& } #endif // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC -#endif // EIGEN_ARCH_ARM64 - } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 6cbb8ec67..6c41bf2ee 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -545,7 +545,7 @@ /// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture /// supports Neon vector intrinsics for fp16. -#if EIGEN_ARCH_ARM_OR_ARM64 +#if EIGEN_ARCH_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 @@ -557,7 +557,7 @@ /// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture /// supports Neon scalar intrinsics for fp16. 
-#if EIGEN_ARCH_ARM_OR_ARM64 +#if EIGEN_ARCH_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 -- GitLab From 33ba98b64107db42a691708581cd70af0d2b72da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 26 Jun 2023 19:21:54 +0000 Subject: [PATCH 234/266] Ensure EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC is always defined on arm. (cherry picked from commit 31cd2ad371bf8b7486951dfcd20381427aba5546) --- Eigen/src/Core/util/Macros.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 6c41bf2ee..eebfd901d 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -545,9 +545,11 @@ /// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture /// supports Neon vector intrinsics for fp16. -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways even in GCC (e.g. vdiv_f16, vsqrt_f16). + #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 #else #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 @@ -557,9 +559,11 @@ /// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture /// supports Neon scalar intrinsics for fp16. -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways, even in GCC (e.g. vceqh_f16). 
+ #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 #endif #endif -- GitLab From daa0b70a656d3c03f1a518c06ac0845df654b1e4 Mon Sep 17 00:00:00 2001 From: Kevin Leonardic Date: Mon, 3 Jul 2023 13:44:20 +0200 Subject: [PATCH 235/266] Fix argument for _mm256_cvtps_ph imm parameter (cherry picked from commit d4b05454a7b33139ce6636584550780ff15af6ed) --- Eigen/src/Core/arch/AVX/PacketMath.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index 5be14ba11..24e01c46f 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -1020,7 +1020,7 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #ifdef EIGEN_HAS_FP16_C - return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT); #else EIGEN_ALIGN32 float aux[8]; pstore(aux, a); -- GitLab From 0f39c851a5591d2d5bce82e5539a009e531ac6ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Fri, 7 Jul 2023 18:37:14 +0000 Subject: [PATCH 236/266] Fix use of arg function in CUDA. 
(cherry picked from commit 63dcb429cd91f7e1cdfaffb894bb368502ee0c38) --- Eigen/src/Core/MathFunctions.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index bff595300..48184cc6a 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -588,12 +588,8 @@ struct arg_default_impl { EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - // HIP does not seem to have a native device side implementation for the math routine "arg" + // There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg. using std::arg; - #else - EIGEN_USING_STD(arg); - #endif return static_cast(arg(x)); } }; -- GitLab From 1ec1b16d36114fe6c52bb39a6c3ec41eeefcc923 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 14 Sep 2021 13:53:55 -0700 Subject: [PATCH 237/266] Add buildtests_gpu and check_gpu to simplify GPU testing. This is in preparation of adding GPU tests to the CI, allowing us to limit building/testing of GPU-specific tests for a given GPU-capable runner. GPU tests are tagged with the label "gpu". The new targets ``` make buildtests_gpu make check_gpu ``` allow building and running only the gpu tests. (cherry picked from commit 16f9a20a6f408fe8896d9c07c91e23e8159a9d2c) --- cmake/EigenConfigureTesting.cmake | 9 +++++++++ cmake/EigenTesting.cmake | 18 ++++++++++++------ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index 9cb3bb20b..add6aab53 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -11,6 +11,15 @@ add_custom_target(buildtests) add_custom_target(check COMMAND "ctest") add_dependencies(check buildtests) +# Convenience target for only building GPU tests. 
+add_custom_target(buildtests_gpu) +add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" + "--no-compress-output" + "--build-no-clean" + "-T" "test" + "-L" "gpu") +add_dependencies(check_gpu buildtests_gpu) + # check whether /bin/bash exists (disabled as not used anymore) # find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index bc47da86d..995354f05 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -23,7 +23,9 @@ macro(ei_add_test_internal testname testname_with_suffix) set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n") set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + set(is_gpu_test OFF) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) + set(is_gpu_test ON) if(EIGEN_TEST_HIP) hip_reset_flags() hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") @@ -57,10 +59,10 @@ macro(ei_add_test_internal testname testname_with_suffix) add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") - add_dependencies(eigen2_buildtests ${targetname}) - else() - add_dependencies(buildtests ${targetname}) + add_dependencies(buildtests ${targetname}) + + if (is_gpu_test) + add_dependencies(buildtests_gpu ${targetname}) endif() if(EIGEN_NO_ASSERTION_CHECKING) @@ -118,6 +120,11 @@ macro(ei_add_test_internal testname testname_with_suffix) add_dependencies("Build${current_subproject}" ${targetname}) set_property(TEST ${testname_with_suffix} PROPERTY LABELS "${current_subproject}") endif() + if (is_gpu_test) + # Add gpu tag for testing only GPU tests. + set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu") + endif() + if(EIGEN_SYCL) # Force include of the SYCL file at the end to avoid errors. 
set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) @@ -775,8 +782,7 @@ macro(ei_add_smoke_tests smoke_test_list) if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST) add_dependencies("${buildtarget}" "${test}") # Add label smoketest to be able to run smoketests using ctest - get_property(test_labels TEST ${test} PROPERTY LABELS) - set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest") + set_property(TEST ${test} APPEND PROPERTY LABELS "smoketest") endif() endforeach() endmacro(ei_add_smoke_tests) -- GitLab From dfcd6de20a6f9601c76be65794c611336450e750 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 24 Sep 2021 14:43:59 -0700 Subject: [PATCH 238/266] Clean up CUDA CMake files. - Unify test/CMakeLists.txt and unsupported/test/CMakeLists.txt - Added `EIGEN_CUDA_FLAGS` that are appended to the set of flags passed to the cuda compiler (nvcc or clang). The latter is to support passing custom flags (e.g. `-arch=` to nvcc, or to disable cuda-specific warnings). 
(cherry picked from commit 7b00e8b186a7679b0f46be742809a55d07d4efe8) --- CMakeLists.txt | 3 ++- test/CMakeLists.txt | 36 ++++++++++++++++----------------- unsupported/test/CMakeLists.txt | 30 ++++++++++----------------- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index faaaa265f..2c5ac6512 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -437,7 +437,8 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") +set(EIGEN_CUDA_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") +set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 0900435d5..c81ea2dd2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -375,33 +375,33 @@ if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang") message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.") endif() -if(EIGEN_TEST_CUDA) +find_package(CUDA 9.0) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -find_package(CUDA 5.0) -if(CUDA_FOUND) - - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR 
"--relaxed-constexpr") - endif() - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() + string(APPEND CMAKE_CXX_FLAGS "${EIGEN_CUDA_FLAGS}") else() - foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}") + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(gpu_basic) @@ -410,8 +410,6 @@ if(CUDA_FOUND) endif() -endif() - # HIP unit tests option(EIGEN_TEST_HIP "Add HIP support." 
OFF) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index d30fa62bd..f87215c8b 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -289,8 +289,7 @@ endif() endif() -# These tests needs nvcc -find_package(CUDA 7.0) +find_package(CUDA 9.0) if(CUDA_FOUND AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings @@ -302,30 +301,23 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}") endforeach() + string(APPEND CMAKE_CXX_FLAGS "${EIGEN_CUDA_FLAGS}") + else() + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") + endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") - endif() - - set(NVCC_ARCH_FLAGS) - foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND NVCC_ARCH_FLAGS " -gencode 
arch=compute_${ARCH},code=sm_${ARCH}") - endforeach() - set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}") - cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(cxx11_tensor_complex_gpu) -- GitLab From a605d6b996fa87c89036a18d442ddd3689cf3fdb Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 24 Sep 2021 20:15:55 -0700 Subject: [PATCH 239/266] Rename EIGEN_CUDA_FLAGS to EIGEN_CUDA_CXX_FLAGS Also add a missing space for clang. (cherry picked from commit 846d34384af80b80793d32257a7f917eeece41d4) --- CMakeLists.txt | 2 +- test/CMakeLists.txt | 4 ++-- unsupported/test/CMakeLists.txt | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c5ac6512..790fcf35d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -437,7 +437,7 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -set(EIGEN_CUDA_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") +set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c81ea2dd2..4aff37797 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -391,14 +391,14 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() - string(APPEND CMAKE_CXX_FLAGS "${EIGEN_CUDA_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") else() set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(NVCC_ARCH_FLAGS) foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND NVCC_ARCH_FLAGS " -gencode 
arch=compute_${ARCH},code=sm_${ARCH}") endforeach() - set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_FLAGS}") + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index f87215c8b..ab5b684e9 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -307,14 +307,14 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}") endforeach() - string(APPEND CMAKE_CXX_FLAGS "${EIGEN_CUDA_FLAGS}") + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") else() set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(NVCC_ARCH_FLAGS) foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") endforeach() - set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_FLAGS}") + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() -- GitLab From 89a71f31260aff101a61dcb90e5ddbb86c39be86 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 1 Oct 2021 10:20:50 -0700 Subject: [PATCH 240/266] Fix gpu special function tests. Some checks used incorrect values, partly from copy-paste errors, partly from the change in behaviour introduced in !398. Modified results to match scipy, simplified tests by updating `VERIFY_IS_CWISE_APPROX` to work for scalars. 
(cherry picked from commit 701f5d1c91c770e558c7760da14ff3365757e275) --- test/main.h | 18 +++++++++++++ unsupported/test/cxx11_tensor_gpu.cu | 39 ++++++++-------------------- 2 files changed, 29 insertions(+), 28 deletions(-) diff --git a/test/main.h b/test/main.h index 3dd094249..a5b68e2e8 100644 --- a/test/main.h +++ b/test/main.h @@ -391,6 +391,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file, #define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_isMuchSmallerThan(a, b)) #define VERIFY_IS_APPROX_OR_LESS_THAN(a, b) VERIFY(test_isApproxOrLessThan(a, b)) #define VERIFY_IS_NOT_APPROX_OR_LESS_THAN(a, b) VERIFY(!test_isApproxOrLessThan(a, b)) +#define VERIFY_IS_CWISE_EQUAL(a, b) VERIFY(verifyIsCwiseApprox(a, b, true)) +#define VERIFY_IS_CWISE_APPROX(a, b) VERIFY(verifyIsCwiseApprox(a, b, false)) #define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a)) @@ -598,6 +600,22 @@ inline bool verifyIsApprox(const Type1& a, const Type2& b) return ret; } +// verifyIsCwiseApprox is a wrapper to test_isCwiseApprox that outputs the relative difference magnitude if the test fails. +template +inline bool verifyIsCwiseApprox(const Type1& a, const Type2& b, bool exact) +{ + bool ret = test_isCwiseApprox(a,b,exact); + if(!ret) { + if (exact) { + std::cerr << "Values are not an exact match"; + } else { + std::cerr << "Difference too large wrt tolerance " << get_test_precision(a); + } + std::cerr << ", relative error is: " << test_relative_error(a,b) << std::endl; + } + return ret; +} + // The idea behind this function is to compare the two scalars a and b where // the scalar ref is a hint about the expected order of magnitude of a and b. 
// WARNING: the scalar a and b must be positive diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index 137d0d596..0a37c0293 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -681,8 +681,8 @@ void test_gpu_digamma() expected_out(2) = Scalar(1.2561176684318); expected_out(3) = Scalar(2.398239129535781); expected_out(4) = Scalar(9.210340372392849); - expected_out(5) = std::numeric_limits::infinity(); - expected_out(6) = std::numeric_limits::infinity(); + expected_out(5) = std::numeric_limits::quiet_NaN(); + expected_out(6) = std::numeric_limits::quiet_NaN(); std::size_t bytes = in.size() * sizeof(Scalar); @@ -704,11 +704,8 @@ void test_gpu_digamma() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - for (int i = 0; i < 5; ++i) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - for (int i = 5; i < 7; ++i) { - VERIFY_IS_EQUAL(out(i), expected_out(i)); + for (int i = 0; i < 7; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in); @@ -741,7 +738,7 @@ void test_gpu_zeta() expected_out(0) = std::numeric_limits::infinity(); expected_out(1) = Scalar(1.61237534869); expected_out(2) = Scalar(0.234848505667); - expected_out(3) = Scalar(1.03086757337e-5); + expected_out(3) = std::numeric_limits::quiet_NaN(); expected_out(4) = Scalar(0.367879440865); expected_out(5) = Scalar(0.054102025820864097); @@ -769,13 +766,8 @@ void test_gpu_zeta() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), 
expected_out(i)); } gpuFree(d_in_x); @@ -1117,13 +1109,8 @@ void test_gpu_ndtri() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); @@ -1262,12 +1249,8 @@ void test_gpu_betainc() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - for (int i = 1; i < 125; ++i) { - if ((std::isnan)(expected_out(i))) { - VERIFY((std::isnan)(out(i))); - } else { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 125; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); -- GitLab From 554982beeffcf28457d3989d3cbe7d2968b056bf Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 8 Oct 2021 11:38:13 -0700 Subject: [PATCH 241/266] Disable Tree reduction for GPU. For moderately sized inputs, running the Tree reduction quickly fills/overflows the GPU thread stack space, leading to memory errors. This was happening in the `cxx11_tensor_complex_gpu` test, for example. Disabling tree reduction on GPU fixes this. 
(cherry picked from commit 24ebb37f38287d65c0e0b60c714e39faffeb5b94) --- .../Eigen/CXX11/src/Tensor/TensorReduction.h | 31 +++++++++++-------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 583f46256..ff7c5a133 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -166,8 +166,12 @@ struct GenericDimReducer<-1, Self, Op> { }; template + bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && + !Self::ReducerTraits::IsExactlyAssociative && + // GPU threads can quickly run out of stack space + // for moderately sized inputs. + !Self::RunningOnGPU + )> struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -528,6 +532,18 @@ struct TensorReductionEvaluatorBase::size; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static constexpr bool RunningOnGPU = internal::is_same::value; + static constexpr bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) +static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; +static const bool RunningOnGPU = false; +#else + static constexpr bool RunningOnGPU = false; + static constexpr bool RunningOnSycl = false; +#endif enum { IsAligned = false, @@ -950,17 +966,6 @@ struct TensorReductionEvaluatorBase::value; - static const bool RunningOnSycl = false; -#elif defined(EIGEN_USE_SYCL) -static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; -static const bool RunningOnGPU = false; -#else - static const bool RunningOnGPU = false; - static const bool RunningOnSycl = false; -#endif EvaluatorPointerType m_result; const 
Device EIGEN_DEVICE_REF m_device; -- GitLab From ac561cd038c9a6f21a31a1c040f7dc86deb2b59c Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Fri, 1 Oct 2021 05:16:30 +0000 Subject: [PATCH 242/266] Reduce tensor_contract_gpu test. The original test times out after 60 minutes on Windows, even when setting flags to optimize for speed. Reducing the number of contractions performed from 3600->27 for subtests 8,9 allows the two to run in just over a minute each. (cherry picked from commit be9e7d205f38e3e8effdfdded88817b371673930) --- unsupported/test/cxx11_tensor_contract_gpu.cu | 58 ++++++++++++------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index 575bdc1f9..5abf2131d 100644 --- a/unsupported/test/cxx11_tensor_contract_gpu.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -25,10 +25,6 @@ typedef Tensor::DimensionPair DimPair; template void test_gpu_contraction(int m_size, int k_size, int n_size) { - std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU Tensor t_left(m_size, k_size); Tensor t_right(k_size, n_size); Tensor t_result(m_size, n_size); @@ -171,25 +167,45 @@ void test_gpu_contraction_n() { template void test_gpu_contraction_sizes() { - int m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; - - int n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_gpu_contraction(m_sizes[i], n_sizes[j], k_sizes[k]); + int m_sizes[3][5] = {{ 31, 39, 63, 64,
65}, + {127, 129, 255, 257 , 511}, + {512, 513, 1023, 1024, 1025}}; + + int n_sizes[3][5] = {{ 31, 39, 63, 64, 65}, + {127, 129, 255, 257, 511}, + {512, 513, 1023, 1024, 1025}}; + + int k_sizes[3][6] = {{ 31, 39, 63, 64, 65, 95}, + { 96, 127, 129, 255, 257, 511}, + {512, 513, 725, 1023, 1024, 1025}}; + + // Some selection of specific cases. + // - m changes rows each iteration + // - n changes rows each 3 iterations + // - k changes rows each 9 iterations + // - within a row, advance one column each iteration + const int m_cols = 5; + const int n_cols = 5; + const int k_cols = 6; + int m_offset = 0; + int n_offset = 1; + int k_offset = 2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + for (int l = 0; l < 3; ++l) { + int m = m_sizes[l][m_offset]; + int n = n_sizes[j][n_offset]; + int k = k_sizes[i][k_offset]; + test_gpu_contraction(m, n, k); + n_offset = (n_offset + 1) % n_cols; + k_offset = (k_offset + 1) % k_cols; + } + m_offset = (m_offset + 1) % m_cols; + if (j < 2) { + n_offset = (n_offset + n_cols - 3) % n_cols; // Rewind 3. } } + k_offset = (k_offset + 2 * k_cols - 9) % k_cols; // Rewind 9. } } -- GitLab From 6973687c70a722cd19263152c2f6c1e49862a18f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Tue, 15 Mar 2022 20:22:23 +0000 Subject: [PATCH 243/266] Fix up PowerPC MMA flags so it builds by default.
(cherry picked from commit 65eeedf9646ee6efc457cc3a8f8d9030a6f83689) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 209 +++++++++--------- .../src/Core/arch/AltiVec/MatrixProductMMA.h | 9 +- 2 files changed, 114 insertions(+), 104 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 8feb88ea7..1888347b1 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -17,24 +17,27 @@ #include "MatrixProductCommon.h" -// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX -#if EIGEN_COMP_LLVM -#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) -#ifdef __MMA__ -#define EIGEN_ALTIVEC_MMA_ONLY -#else -#define EIGEN_ALTIVEC_DISABLE_MMA -#endif +// Check for MMA builtin support. +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && defined(__has_builtin) +#if __has_builtin(__builtin_mma_assemble_acc) + #define EIGEN_ALTIVEC_MMA_SUPPORT #endif #endif -#ifdef __has_builtin -#if __has_builtin(__builtin_mma_assemble_acc) - #define ALTIVEC_MMA_SUPPORT -#endif +// Check if and how we should actually use MMA if supported. +#if defined(EIGEN_ALTIVEC_MMA_SUPPORT) + +// Use MMA by default if available. +#if defined(__MMA__) +#define EIGEN_ALTIVEC_MMA_ONLY 1 +// Otherwise, check if we want to enable dynamic dispatch. Not supported by LLVM. 
+#elif defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) && !defined(EIGEN_COMP_LLVM) +#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1 #endif -#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#endif // EIGEN_ALTIVEC_MMA_SUPPORT + +#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) #include "MatrixProductMMA.h" #endif @@ -2477,10 +2480,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2490,7 +2493,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2516,20 +2519,20 @@ void gebp_kernel, std::complex, Index, DataMapper, mr void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, 
ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2554,20 +2557,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const float*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function 
= &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, 
strideA, strideB, offsetA, offsetB); } template @@ -2592,20 +2595,20 @@ void gebp_kernel, float, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const float*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + 
gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2629,10 +2632,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2642,7 +2645,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2667,20 +2670,20 @@ void gebp_kernel, std::complex, Index, DataMapper, const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - 
gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2705,20 +2708,20 @@ void gebp_kernel, double, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const 
DataMapper&, const std::complex*, const double*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else 
+ gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2743,20 +2746,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const double*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports 
("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 5b4449537..7dda42339 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -11,7 +11,11 @@ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H +// If using dynamic dispatch, set the CPU target. +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC push_options #pragma GCC target("cpu=power10,htm") +#endif #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) @@ -611,10 +615,13 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS #undef advanceRows #undef advanceCols -#pragma GCC reset_options } // end namespace internal } // end namespace Eigen +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC pop_options +#endif + #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H -- GitLab From 17d57fb168ac1463736ee1f4c13c9487684146b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Wed, 16 Mar 2022 19:16:28 +0000 Subject: [PATCH 244/266] Fix up PowerPC MMA flags so it builds by default. 
(cherry picked from commit 591906477bc8c8102dbefceefe10d81648865394) --- Eigen/src/Core/arch/AltiVec/MatrixProduct.h | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index 1888347b1..ea7749610 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -17,8 +17,12 @@ #include "MatrixProductCommon.h" +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#define EIGEN_ALTIVEC_DISABLE_MMA 0 +#endif + // Check for MMA builtin support. -#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && defined(__has_builtin) +#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin) #if __has_builtin(__builtin_mma_assemble_acc) #define EIGEN_ALTIVEC_MMA_SUPPORT #endif @@ -27,12 +31,16 @@ // Check if and how we should actually use MMA if supported. #if defined(EIGEN_ALTIVEC_MMA_SUPPORT) -// Use MMA by default if available. -#if defined(__MMA__) -#define EIGEN_ALTIVEC_MMA_ONLY 1 -// Otherwise, check if we want to enable dynamic dispatch. Not supported by LLVM. -#elif defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) && !defined(EIGEN_COMP_LLVM) +#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) +#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0 +#endif + +// Check if we want to enable dynamic dispatch. Not supported by LLVM. +#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM #define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1 +// Otherwise, use MMA by default if available. 
+#elif defined(__MMA__) +#define EIGEN_ALTIVEC_MMA_ONLY 1 #endif #endif // EIGEN_ALTIVEC_MMA_SUPPORT -- GitLab From 208e44c979bbb5a0ecb4b74b834b93396e48e5dd Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Wed, 19 Jul 2023 16:48:07 +0000 Subject: [PATCH 245/266] fix warnings in tensorreduction and memory --- Eigen/src/Core/util/Memory.h | 4 ++-- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 2bf508440..3aea7df51 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -297,14 +297,14 @@ template EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_a std::size_t i=0; EIGEN_TRY { - for (i = 0; i < size; ++i) ::new (ptr + i) T; - return ptr; + for (i = 0; i < size; ++i) ::new (ptr + i) T; } EIGEN_CATCH(...) { destruct_elements_of_array(ptr, i); EIGEN_THROW; } + return ptr; } /** \internal Copy-constructs the elements of an array. diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index ff7c5a133..f1f4eaab7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -594,7 +594,7 @@ static const bool RunningOnGPU = false; m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides[static_cast(NumOutputDims - 1)] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); -- GitLab From 75ebef26b64f4fc172f68eca57cb4ca5107ab4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Sat, 5 Mar 2022 05:49:45 +0000 Subject: [PATCH 246/266] Adds new CMake Options for controlling build components. 
(cherry picked from commit cf82186416d04ea5df2a397d8fe09dc78d40ca65) --- CMakeLists.txt | 6 ++++++ blas/CMakeLists.txt | 3 ++- lapack/CMakeLists.txt | 5 +++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 790fcf35d..121ac94f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -517,6 +517,9 @@ if(BUILD_TESTING) add_subdirectory(failtest) endif() +include(CMakeDetermineFortranCompiler) +option(EIGEN_BUILD_BLAS "Toggles the building of the Eigen Blas library" ${CMAKE_Fortran_COMPILER}) +option(EIGEN_BUILD_LAPACK "Toggles the building of the included Eigen LAPACK library" ${CMAKE_Fortran_COMPILER}) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(blas) add_subdirectory(lapack) @@ -634,6 +637,8 @@ set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen) install (TARGETS eigen EXPORT Eigen3Targets) +option(EIGEN_BUILD_CMAKE_PACKAGE "Enables the creation of EigenConfig.cmake and related files" ON) +if(EIGEN_BUILD_CMAKE_PACKAGE) configure_package_config_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake @@ -669,6 +674,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake # Add uninstall target add_custom_target ( uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) +endif() if (EIGEN_SPLIT_TESTSUITE) ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}") diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index f3a94ec4a..d07090f69 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -1,6 +1,7 @@ project(EigenBlas CXX) +if(EIGEN_BUILD_BLAS) include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) @@ -59,4 +60,4 @@ if(BUILD_TESTING) endif() endif() - +endif() diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c8ca64001..8d6d75401 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,5 +1,7 @@ project(EigenLapack CXX) +if(EIGEN_BUILD_LAPACK AND 
EIGEN_BUILD_BLAS) + include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) @@ -457,3 +459,6 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) endif() +elseif(EIGEN_BUILD_LAPACK AND NOT EIGEN_BUILD_BLAS) + message(FATAL_ERROR "EIGEN_BUILD_LAPACK requires EIGEN_BUILD_BLAS") +endif() #EIGEN_BUILD_LAPACK -- GitLab From d0bfdc1658ca0b4c659fd3702c351d2c2cdc876c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fabian=20Ke=C3=9Fler?= Date: Sun, 24 Oct 2021 21:00:11 +0200 Subject: [PATCH 247/266] optimize cmake scripts for subproject use (cherry picked from commit 19cacd3ecb9dab73c2dd7bc39d9193e06ba92bdd) --- CMakeLists.txt | 76 +++++++++++++++++++++----------------- blas/CMakeLists.txt | 2 +- unsupported/CMakeLists.txt | 2 +- 3 files changed, 45 insertions(+), 35 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 121ac94f1..ac8042b18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,16 @@ cmake_minimum_required(VERSION 3.5.0) project(Eigen3) +# Remove this block after bumping CMake to v3.21.0 +# PROJECT_IS_TOP_LEVEL is defined then by default +if(CMAKE_VERSION VERSION_LESS 3.21.0) + if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(PROJECT_IS_TOP_LEVEL TRUE) + else() + set(PROJECT_IS_TOP_LEVEL FALSE) + endif() +endif() + # guard against in-source builds if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) @@ -23,7 +33,7 @@ endif() ############################################################################# -# retrieve version information # +# retrieve version information # ############################################################################# # automatically parse the version number @@ -127,13 +137,11 @@ if(NOT STANDARD_MATH_LIBRARY_FOUND) "Can't link to the standard math library. 
Please report to the Eigen developers, telling them about your platform.") else() - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}") else() set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}") endif() - endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -504,8 +512,9 @@ if(EIGEN_BUILD_DOC) endif() -option(BUILD_TESTING "Enable creation of Eigen tests." ON) -if(BUILD_TESTING) +cmake_dependent_option(BUILD_TESTING "Enable creation of tests." ON "PROJECT_IS_TOP_LEVEL" OFF) +option(EIGEN_BUILD_TESTING "Enable creation of Eigen tests." ${BUILD_TESTING}) +if(EIGEN_BUILD_TESTING) include(EigenConfigureTesting) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -576,7 +585,7 @@ endif() configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) ei_testing_print_summary() endif() @@ -584,34 +593,35 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) -if(cmake_generator_tolower MATCHES "makefile") - message(STATUS "Available targets (use: make TARGET):") -else() - message(STATUS "Available targets (use: cmake --build . --target TARGET):") -endif() -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "Target | Description") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "install | Install Eigen. Headers will be installed to:") -message(STATUS " | /") -message(STATUS " | Using the following values:") -message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") -message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") -message(STATUS " | Change the install location of Eigen headers using:") -message(STATUS " | cmake . 
-DCMAKE_INSTALL_PREFIX=yourprefix") -message(STATUS " | Or:") -message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") -message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") -if(BUILD_TESTING) - message(STATUS "check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") +if(PROJECT_IS_TOP_LEVEL) + string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) + if(cmake_generator_tolower MATCHES "makefile") + message(STATUS "Available targets (use: make TARGET):") + else() + message(STATUS "Available targets (use: cmake --build . --target TARGET):") + endif() + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "Target | Description") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "install | Install Eigen. Headers will be installed to:") + message(STATUS " | /") + message(STATUS " | Using the following values:") + message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") + message(STATUS " | Change the install location of Eigen headers using:") + message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") + message(STATUS " | Or:") + message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") + message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") + if(EIGEN_BUILD_TESTING) + message(STATUS "check | Build and run the unit-tests. 
Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + endif() + message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "uninstall| Remove files installed by the install target") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "") endif() -message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") -message(STATUS "uninstall| Remove files installed by the install target") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "") - set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} ) set ( EIGEN_VERSION_MAJOR ${EIGEN_WORLD_VERSION} ) diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index d07090f69..c530957fb 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -51,7 +51,7 @@ endforeach() if(EIGEN_Fortran_COMPILER_WORKS) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(testing) # can't do EXCLUDE_FROM_ALL here, breaks CTest else() diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt index 34408c017..67d1f6262 100644 --- a/unsupported/CMakeLists.txt +++ b/unsupported/CMakeLists.txt @@ -2,7 +2,7 @@ add_subdirectory(Eigen) if(EIGEN_BUILD_DOC) add_subdirectory(doc EXCLUDE_FROM_ALL) endif() -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest else() -- GitLab From 764b132a79aaf33e958169a14a011b2e0eec1400 Mon Sep 17 00:00:00 2001 From: Erik Schultheis Date: Wed, 5 Jan 2022 00:46:09 +0000 Subject: [PATCH 248/266] ensure that eigen::internal::size is not found by ADL, rename to ssize and... 
(cherry picked from commit 9210e71fb378a0f1542272506dc2759b6c147237) --- Eigen/src/Core/IndexedView.h | 4 ++-- Eigen/src/Core/util/Meta.h | 25 +++++++++++++++++++------ Eigen/src/plugins/IndexedViewMethods.h | 6 +++--- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 1297d6a24..05c2bc9cc 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -122,10 +122,10 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } + Index rows() const { return internal::index_list_size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } + Index cols() const { return internal::index_list_size(m_colIndices); } /** \returns the nested expression */ const typename internal::remove_all::type& diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index a7966b867..8751ed23e 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -457,20 +457,33 @@ template struct array_size > { }; #endif + /** \internal - * Analogue of the std::size free function. - * It returns the size of the container or view \a x of type \c T + * Analogue of the std::ssize free function. + * It returns the signed size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * + * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. 
*/ -template -EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } +#if EIGEN_COMP_CXXVER < 20 +template +EIGEN_CONSTEXPR auto index_list_size(const T& x) { + using R = std::common_type_t>; + return static_cast(x.size()); +} -template -EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } +#else +template +EIGEN_CONSTEXPR auto index_list_size(T&& x) { + using std::ssize; + return ssize(std::forward(x)); +} +#endif // EIGEN_COMP_CXXVER /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 5bfb19ac6..15c35b0bf 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -90,8 +90,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); + internal::index_list_size(actualRowIndices), + internal::index_list_size(actualColIndices)); } // The following overload returns a Scalar @@ -168,7 +168,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); + (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); } template -- GitLab From 0db5928f0006fd56a53b37f743a4a0c41166af39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 16 Oct 2023 19:56:53 +0000 Subject: [PATCH 249/266] Eliminate use of _res. 
(cherry picked from commit 5bdf58b8df31b0377bc5f57ba63d8479793b7bae) --- Eigen/src/Core/products/GeneralMatrixMatrix.h | 12 ++-- .../products/GeneralMatrixMatrixTriangular.h | 18 +++--- .../Core/products/SelfadjointMatrixMatrix.h | 42 ++++++------ .../Core/products/TriangularMatrixVector.h | 42 ++++++------ .../products/TriangularMatrixVector_BLAS.h | 64 +++++++++---------- .../SparseSparseProductWithPruning.h | 24 +++---- test/main.h | 3 + 7 files changed, 104 insertions(+), 101 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index caa65fccc..73ddd260e 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -59,9 +59,9 @@ typedef gebp_traits Traits; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, - const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) @@ -69,9 +69,9 @@ static void run(Index rows, Index cols, Index depth, typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs, lhsStride); - RhsMapper rhs(_rhs, rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_, lhsStride); + RhsMapper rhs(rhs_, rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 6ba0d9bdb..c0b5d8050 100644 --- 
a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -60,9 +60,9 @@ template { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; @@ -70,9 +70,9 @@ struct general_matrix_matrix_triangular_product LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -113,7 +113,7 @@ struct general_matrix_matrix_triangular_product::ret }; - void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { typedef blas_data_mapper ResMapper; typedef blas_data_mapper BufferMapper; - ResMapper res(_res, resStride, resIncr); + ResMapper res(res_, resStride, resIncr); gebp_kernel gebp_kernel1; gebp_kernel gebp_kernel2; diff --git a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index 33ecf10f6..86c8f3ee9 100644 --- a/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ 
b/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -43,7 +43,7 @@ struct symm_pack_lhs for(Index w=0; w::type>::half HalfPacket; typedef typename unpacket_traits::type>::half>::half QuarterPacket; @@ -53,7 +53,7 @@ struct symm_pack_lhs HasHalf = (int)HalfPacketSize < (int)PacketSize, HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; - const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper lhs(lhs_,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; @@ -101,11 +101,11 @@ template struct symm_pack_rhs { enum { PacketSize = packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; - const_blas_data_mapper rhs(_rhs,rhsStride); + const_blas_data_mapper rhs(rhs_,rhsStride); Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; @@ -330,8 +330,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -342,9 +342,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -355,10 +355,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - LhsTransposeMapper lhs_transpose(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + LhsTransposeMapper lhs_transpose(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -425,8 +425,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -437,9 +437,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -448,8 +448,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + ResMapper res(res_,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction 
Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -466,7 +466,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix GEPP for(Index i2=0; i2 EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index size = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : (std::min)(_rows,_cols); - Index cols = IsLower ? (std::min)(_rows,_cols) : _cols; + Index size = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : (std::min)(rows_,cols_); + Index cols = IsLower ? (std::min)(rows_,cols_) : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map, 0, InnerStride<> > RhsMap; - const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); typedef Map > ResMap; - ResMap res(_res,rows); + ResMap res(res_,rows); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; @@ -84,7 +84,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha) { static const Index 
PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index diagSize = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : diagSize; - Index cols = IsLower ? diagSize : _cols; + Index diagSize = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : diagSize; + Index cols = IsLower ? diagSize : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map > RhsMap; - const RhsMap rhs(_rhs,cols); + const RhsMap rhs(rhs_,cols); typename conj_expr_if::type cjRhs(rhs); typedef Map, 0, InnerStride<> > ResMap; - ResMap res(_res,rows,InnerStride<>(resIncr)); + ResMap res(res_,rows,InnerStride<>(resIncr)); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 3d47a2b94..0f8d3a1da 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -50,18 +50,18 @@ struct triangular_matrix_vector_product_trmv : #define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index 
rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; @@ -81,23 +81,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -121,24 +121,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size*lda; \ + y = res_; \ + a = lhs_ + size*lda; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ @@ -170,23 +170,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? 
size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -210,24 +210,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size; \ + y = res_; \ + a = lhs_ + size; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 88820a48f..25ce404b8 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -90,9 +90,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res.swap(res_); } }; @@ -104,9 +104,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; - SparseTemporaryType _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); - res = _res; + SparseTemporaryType res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res = res_; } }; @@ -117,9 +117,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, res_, tolerance); + res.swap(res_); } }; @@ -137,9 +137,9 @@ struct 
sparse_sparse_product_with_pruning_selector SparseTemporaryType; -// SparseTemporaryType _res(res.cols(), res.rows()); -// sparse_sparse_product_with_pruning_impl(rhs, lhs, _res); -// res = _res.transpose(); +// SparseTemporaryType res_(res.cols(), res.rows()); +// sparse_sparse_product_with_pruning_impl(rhs, lhs, res_); +// res = res_.transpose(); } }; diff --git a/test/main.h b/test/main.h index a5b68e2e8..19bbf1b81 100644 --- a/test/main.h +++ b/test/main.h @@ -111,6 +111,9 @@ struct imag {}; // `I` may be defined by complex.h: #define I FORBIDDEN_IDENTIFIER +// _res is defined by resolv.h +#define _res FORBIDDEN_IDENTIFIER + // Unit tests calling Eigen's blas library must preserve the default blocking size // to avoid troubles. #ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS -- GitLab From 7176ae16238ded7fb5ed30a7f5215825b3abd134 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 16 Oct 2023 15:38:25 -0700 Subject: [PATCH 250/266] Make 3.4.1 compatible with c++03 --- Eigen/src/Core/util/Meta.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index 8751ed23e..b7635f985 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -470,9 +470,8 @@ template struct array_size > { */ #if EIGEN_COMP_CXXVER < 20 template -EIGEN_CONSTEXPR auto index_list_size(const T& x) { - using R = std::common_type_t>; - return static_cast(x.size()); +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T& x) { + return static_cast(x.size()); } template -- GitLab From 1217390db41285d95e7c7be604e36666bff02a79 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 25 Oct 2023 20:45:11 +0000 Subject: [PATCH 251/266] Fix windows+CUDA builds --- Eigen/src/Core/util/Macros.h | 2 +- unsupported/test/cxx11_tensor_of_float16_gpu.cu | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index eebfd901d..961097baf 
100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -1198,7 +1198,7 @@ namespace Eigen { #define EIGEN_USING_STD(FUNC) using std::FUNC; #endif -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || (EIGEN_COMP_MSVC == 1900 && EIGEN_COMP_NVCC)) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1916 || (EIGEN_COMP_MSVC == 1916 && EIGEN_COMP_NVCC)) // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, // otherwise we get duplicate definition errors // For later MSVC versions, we require explicit operator= definition, otherwise we get diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 30bcc1d28..e11782a79 100644 --- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -113,7 +113,7 @@ void test_gpu_unary() { gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = gpu_float.cast().abs().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().abs().template cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -154,7 +154,7 @@ void test_gpu_elementwise() { gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; - gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); + gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).template cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -444,8 +444,8 @@ void test_gpu_forced_evals() { gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half1.device(gpu_device) = gpu_float.cast().abs().eval().cast(); - 
gpu_res_half2.device(gpu_device) = gpu_float.cast().abs().broadcast(no_bcast).eval().cast(); + gpu_res_half1.device(gpu_device) = gpu_float.cast().abs().eval().template cast(); + gpu_res_half2.device(gpu_device) = gpu_float.cast().abs().broadcast(no_bcast).eval().template cast(); Tensor half_prec1(num_elem); Tensor half_prec2(num_elem); -- GitLab From e734787bb7b98204d7bfeb5bc80cbc6b65d77b2f Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 31 Jan 2023 19:40:24 +0000 Subject: [PATCH 252/266] Fix pre-POWER8_VECTOR bugs in pcmp_lt and pnegate and reactivate psqrt. (cherry picked from commit 4a58f30aa083d2f5753465a36730ca7734b483be) --- Eigen/src/Core/arch/AltiVec/MathFunctions.h | 50 +++++++++++++-------- Eigen/src/Core/arch/AltiVec/PacketMath.h | 44 ++++++++++++------ 2 files changed, 63 insertions(+), 31 deletions(-) diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 3a7a32936..2b7c204e3 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -40,24 +40,9 @@ Packet4f pcos(const Packet4f& _x) return pcos_float(_x); } -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt(const Packet4f& x) -{ - return vec_rsqrt(x); -} -#endif - #ifdef __VSX__ -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt(const Packet2d& x) -{ - return vec_rsqrt(x); -} -#endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { return vec_sqrt(x); @@ -69,12 +54,41 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f prsqrt(const Packet4f& x) +{ + return pset1(1.0f) / 
psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d prsqrt(const Packet2d& x) +{ + return pset1(1.0) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp(const Packet2d& _x) { return pexp_double(_x); } -#endif + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} + +#endif // __VSX__ // Hyperbolic Tangent function. template <> diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 2a440545b..528f995d3 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -786,8 +786,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p4f_MZERO); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p4i_ZERO - a; +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { 
return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -865,7 +879,10 @@ template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, con template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +// To fix bug with vec_cmplt on older versions +#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { Packet4f c = reinterpret_cast(vec_cmpge(a,b)); @@ -1341,16 +1358,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, con BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); } -template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_generic(a,exponent); } @@ -2304,7 +2311,11 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 1, HasSqrt = 1, +#if !EIGEN_COMP_CLANG HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -2393,7 +2404,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> 
EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p2d_MZERO); +#endif +} template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } -- GitLab From cf207eacd53dd35c65a4cde029fd36c960595ba2 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Sat, 31 Dec 2022 04:52:36 +0000 Subject: [PATCH 253/266] Patch SparseLU (cherry picked from commit a8bab0d8ae0601392287526840ddcb1af3bb85a6) --- Eigen/SparseLU | 2 - Eigen/src/SparseLU/SparseLU.h | 1 + Eigen/src/SparseLU/SparseLU_gemm_kernel.h | 280 ---------------------- Eigen/src/SparseLU/SparseLU_kernel_bmod.h | 3 +- Eigen/src/SparseLU/SparseLU_panel_bmod.h | 3 +- 5 files changed, 3 insertions(+), 286 deletions(-) delete mode 100644 Eigen/src/SparseLU/SparseLU_gemm_kernel.h diff --git a/Eigen/SparseLU b/Eigen/SparseLU index 37c4a5c5a..047cf0dca 100644 --- a/Eigen/SparseLU +++ b/Eigen/SparseLU @@ -25,8 +25,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include "src/SparseLU/SparseLU_gemm_kernel.h" - #include "src/SparseLU/SparseLU_Structs.h" #include "src/SparseLU/SparseLU_SupernodalMatrix.h" #include "src/SparseLU/SparseLUImpl.h" diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 761b95c98..6eb79502f 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -38,6 +38,7 @@ public: SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {} SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() { this->m_sparseLU = view.m_sparseLU; + this->m_isInitialized = view.m_isInitialized; } void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;} diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h deleted file mode 100644 index 
e37c2fe0d..000000000 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ /dev/null @@ -1,280 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2012 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H -#define EIGEN_SPARSELU_GEMM_KERNEL_H - -namespace Eigen { - -namespace internal { - - -/** \internal - * A general matrix-matrix product kernel optimized for the SparseLU factorization. - * - A, B, and C must be column major - * - lda and ldc must be multiples of the respective packet size - * - C must have the same alignment as A - */ -template -EIGEN_DONT_INLINE -void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc) -{ - using namespace Eigen::internal; - - typedef typename packet_traits::type Packet; - enum { - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - PacketSize = packet_traits::size, - PM = 8, // peeling in M - RN = 2, // register blocking - RK = NumberOfRegisters>=16 ? 
4 : 2, // register blocking - BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk - SM = PM*PacketSize // step along M - }; - Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking - Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once - Index i0 = internal::first_default_aligned(A,m); - - eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m))); - - // handle the non aligned rows of A and C without any optimization: - for(Index i=0; i(BM, m-ib); // actual number of rows - Index actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling - Index actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization - - // Let's process two columns of B-C at once - for(Index j=0; j(Bc0[0]); } - { b10 = pset1(Bc0[1]); } - if(RK==4) { b20 = pset1(Bc0[2]); } - if(RK==4) { b30 = pset1(Bc0[3]); } - { b01 = pset1(Bc1[0]); } - { b11 = pset1(Bc1[1]); } - if(RK==4) { b21 = pset1(Bc1[2]); } - if(RK==4) { b31 = pset1(Bc1[3]); } - - Packet a0, a1, a2, a3, c0, c1, t0, t1; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = C+ib+(j+0)*ldc; - Scalar* C1 = C+ib+(j+1)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);} -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - c1 = pload(C1+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - KMADD(c1, a0, b01, t1) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - KMADD(c1, a1, b11, t1) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ 
KMADD(c1, a2, b21, t1) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ KMADD(c1, a3, b31, t1) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); \ - pstore(C1+i+(I)*PacketSize, c1) - - // process rows of A' - C' with aggressive vectorization and peeling - for(Index i=0; i0) - { - const Scalar* Bc0 = B+(n-1)*ldb; - - for(Index k=0; k(Bc0[0]); - b10 = pset1(Bc0[1]); - if(RK==4) b20 = pset1(Bc0[2]); - if(RK==4) b30 = pset1(Bc0[3]); - - Packet a0, a1, a2, a3, c0, t0/*, t1*/; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = C+ib+(n_end)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); - - // aggressive vectorization and peeling - for(Index i=0; i0) - { - for(Index j=0; j1 ? 
Aligned : 0 - }; - typedef Map, Alignment > MapVector; - typedef Map, Alignment > ConstMapVector; - if(rd==1) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b); - - else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b); - - else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b) - + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b); - } - } - - } // blocking on the rows of A and C -} -#undef KMADD - -} // namespace internal - -} // namespace Eigen - -#endif // EIGEN_SPARSELU_GEMM_KERNEL_H diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index 8c1b3e8bc..7a101ea0c 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -69,8 +69,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod::run(const Index seg Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize; Map, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); - l.setZero(); - internal::sparselu_gemm(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride()); + l.noalias() = B * u; // Scatter tempv[] into SPA dense[] as a temporary storage isub = lptr + no_zeros; diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index f052001c8..92cdb0e45 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -148,8 +148,7 @@ void SparseLUImpl::panel_bmod(const Index m, const Index w, Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize; 
MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); - L.setZero(); - internal::sparselu_gemm(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride()); + L.noalias() = B * U; // scatter U and L u_col = 0; -- GitLab From bae907b8f6078b1df290729eef946360315bd312 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Sat, 4 Nov 2023 16:37:06 -0700 Subject: [PATCH 254/266] Update version to 3.4.1 Tests all pass: https://gitlab.com/libeigen/eigen_ci_cross_testing/-/pipelines/1060764169 --- Eigen/src/Core/util/Macros.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 961097baf..cd2dbe77a 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -17,7 +17,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 4 -#define EIGEN_MINOR_VERSION 0 +#define EIGEN_MINOR_VERSION 1 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ -- GitLab From fc5575264ffbb14a7c8b13e57fbe85b8f30b6745 Mon Sep 17 00:00:00 2001 From: Silvio Traversaro Date: Fri, 10 Nov 2023 04:03:11 +0000 Subject: [PATCH 255/266] Backport "disambiguate overloads for empty index list" to 3.4 branch --- unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index a4922e913..75b919839 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -261,7 +261,7 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array customIndices2Array(IndexType& idx, numeric_list) { return { static_cast(idx[First]), static_cast(idx[Is])... 
}; -- GitLab From 2e3f1d80449940e0951e85182d0bbd2b1a114b00 Mon Sep 17 00:00:00 2001 From: arthurfeeney Date: Sun, 16 Jan 2022 11:28:31 -0600 Subject: [PATCH 256/266] Fix implicit conversion warning in GEBP kernel's packing (cherry picked from commit 937c3d73cbf55b6edc658decf7931b90ede1daef) --- .../src/Core/products/GeneralBlockPanelKernel.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index f35b760c1..4c649a281 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -2269,8 +2269,8 @@ EIGEN_DONT_INLINE void gemm_pack_lhs0) { Index remaining_rows = rows-i; @@ -2290,21 +2290,21 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } else if (HasHalf && psize == HalfPacketSize) { gone_half = true; PacketBlock kernel_half; - for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_half); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); } else if (HasQuarter && psize == QuarterPacketSize) { gone_quarter = true; PacketBlock kernel_quarter; - for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = 
lhs.template loadPacket(i+p+m, k); ptranspose(kernel_quarter); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); } } count += psize*pack; -- GitLab From 7c6020e4246593b45f819beaf20a27e82f410f97 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Fri, 21 Jul 2023 03:47:40 +0000 Subject: [PATCH 257/266] Fix -Waggressive-loop-optimizations (cherry picked from commit 4e9e493b4abc57dba377fc326082b40d08193619) --- Eigen/src/Core/products/GeneralMatrixVector.h | 32 +++++++++++-------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index dfb6aebce..b6579cc99 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,6 +359,10 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c7 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -393,7 +396,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c3 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -436,7 +439,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c1 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -465,7 +468,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)); ResPacketHalf c0_h = pset1(ResScalar(0)); ResPacketQuarter c0_q = 
pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } ResScalar cc0 = predux(c0); if (HasHalf) { - for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) { RhsPacketHalf b0 = rhs.template load(j,0); c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); @@ -496,14 +500,14 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(j,0); c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); } cc0 += predux(c0_q); } - for(; j Date: Mon, 20 Nov 2023 17:26:39 +0000 Subject: [PATCH 258/266] Gemv microoptimization (cherry picked from commit d1b03fb5c91dcf0e48555bba8387201a0b1425d9) --- Eigen/src/Core/products/GeneralMatrixVector.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index b6579cc99..0b622c8da 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,9 +359,9 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product Date: Tue, 21 Nov 2023 03:49:04 +0000 Subject: [PATCH 259/266] Update file GeneralMatrixVector.h (cherry picked from commit 283dec7f257d463ff66d688cda17f78118eb3caa) --- Eigen/src/Core/products/GeneralMatrixVector.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index 0b622c8da..b8fc5a922 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,9 +359,10 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::type; + const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / 
LhsPacketSize); + const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf); + const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter); Index i=0; for(; i Date: Fri, 24 Nov 2023 19:42:54 +0000 Subject: [PATCH 260/266] replace using with typedef --- Eigen/src/Core/products/GeneralMatrixVector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index b8fc5a922..974a04705 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,7 +359,7 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::type; + typedef typename make_unsigned::type UnsignedIndex; const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize); const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf); const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter); -- GitLab From 4be28702672e5b56cb2b6d840a212ef139a4e9f0 Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Mon, 27 Nov 2023 10:08:18 -0800 Subject: [PATCH 261/266] Only apply ASM work-around for min/max on GNUC strict. Fixes #2742. 
--- Eigen/src/Core/arch/SSE/PacketMath.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index 9feca1ccc..b485e0df1 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -444,7 +444,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packe template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -463,7 +463,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -494,7 +494,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_ps, so we have to // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, -- GitLab From b8f894947a080d072bec0671feaa948838666fe5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antonio=20S=C3=A1nchez?= Date: Mon, 11 Dec 2023 21:03:09 +0000 Subject: [PATCH 262/266] Add internal ctz/clz implementation. (cherry picked from commit 75e273afcc86c4580aae12fb4e6e68c252cc2af0) --- Eigen/src/Core/MathFunctions.h | 154 +++++++++++++++++++++++++++++++-- test/CMakeLists.txt | 25 +++--- test/clz.cpp | 74 ++++++++++++++++ 3 files changed, 234 insertions(+), 19 deletions(-) create mode 100644 test/clz.cpp diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 48184cc6a..299b072d3 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -877,13 +877,153 @@ struct meta_floor_log2 // no value, error at compile time }; -template -struct random_default_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& y) - { - if (y <= x) - return x; +template +struct count_bits_impl { + static_assert(std::is_integral::value && std::is_unsigned::value, + "BitsType must be an unsigned integer"); + + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits >> shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits << shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } +}; + +// Count leading zeros. +template +EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + return count_bits_impl::clz(bits); +} + +// Count trailing zeros. 
+template +EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return count_bits_impl::ctz(bits); +} + +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct count_bits_impl> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clz(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return bits == 0 ? kNumBits : __builtin_ctz(static_cast(bits)); + } +}; + +template +struct count_bits_impl< + BitsType, std::enable_if_t> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzl(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return bits == 0 ? kNumBits : __builtin_ctzl(static_cast(bits)); + } +}; + +template +struct count_bits_impl> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzll(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return bits == 0 ? 
kNumBits : __builtin_ctzll(static_cast(bits)); + } +}; + +#elif EIGEN_COMP_MSVC + +template +struct count_bits_impl> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static const int kLeadingBitsOffset = static_cast((sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT); + unsigned long out; + _BitScanReverse(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out - kLeadingBitsOffset); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + unsigned long out; + _BitScanForward(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#ifdef _WIN64 + +template +struct count_bits_impl< + BitsType, std::enable_if_t> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static_assert(std::is_integral::value, "BitsType must be a built-in integer"); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + static const int kLeadingBitsOffset = static_cast((sizeof(__int64) - sizeof(BitsType)) * CHAR_BIT); + unsigned long out; + _BitScanReverse64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out - kLeadingBitsOffset); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + unsigned long out; + _BitScanForward64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#endif // _WIN64 + +#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct random_default_impl { + static inline Scalar run(const Scalar& x, const Scalar& y) { + if (y <= x) return x; // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself. typedef typename make_unsigned::type ScalarU; // ScalarX is the widest of ScalarU and unsigned int. 
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4aff37797..dbd4bc618 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -48,7 +48,7 @@ if(CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ") - + ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ") @@ -61,7 +61,7 @@ if(UMFPACK_FOUND AND EIGEN_BUILD_BLAS) set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ") - + ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ") @@ -74,7 +74,7 @@ if(KLU_FOUND AND EIGEN_BUILD_BLAS) set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") - + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") @@ -87,7 +87,7 @@ if(SuperLU_FOUND AND EIGEN_BUILD_BLAS) set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") - + ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") @@ -160,6 +160,7 @@ endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official") add_custom_target(BuildOfficial) +ei_add_test(clz) ei_add_test(rand) ei_add_test(meta) ei_add_test(numext) @@ -383,7 +384,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE 
"-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -401,11 +402,11 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - + ei_add_test(gpu_basic) - + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() @@ -418,8 +419,8 @@ if (EIGEN_TEST_HIP) set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") if (EXISTS ${HIP_PATH}) - - list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) find_package(HIP REQUIRED) if (HIP_FOUND) @@ -433,12 +434,12 @@ if (EIGEN_TEST_HIP) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(gpu_basic) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) - + elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") else () message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") - endif() + endif() endif() else () message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") diff --git a/test/clz.cpp b/test/clz.cpp new file mode 100644 index 000000000..1d08b4715 --- /dev/null +++ b/test/clz.cpp @@ -0,0 +1,74 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2023 The Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template +int ref_clz(T val) { + static const int kNumBits = sizeof(T) * CHAR_BIT; + T kMsbMask = T(1) << (kNumBits - 1); + int z = 0; + for (; z < kNumBits && ((val & kMsbMask) == 0); ++z) { + val <<= 1; + } + return z; +} + +template +int ref_ctz(T val) { + static const int kNumBits = sizeof(T) * CHAR_BIT; + T kLsbMask = T(1); + int z = 0; + for (; z < kNumBits && ((val & kLsbMask) == 0); ++z) { + val >>= 1; + } + return z; +} + +template +void test_clz_ctz() { + T step = sizeof(T) <= 2 ? 1 : (Eigen::NumTraits::highest() / (T(1) << 16)); + T iters = Eigen::NumTraits::highest() / step; + for (T i = 0; i < iters; ++i) { + T val = i * step; + int expected_clz = ref_clz(val); + int actual_clz = Eigen::internal::clz(val); + VERIFY(expected_clz == actual_clz); + + int expected_ctz = ref_ctz(val); + int actual_ctz = Eigen::internal::ctz(val); + VERIFY(expected_ctz == actual_ctz); + } +} + +template +void test_clz_ctz_random() { + for (int i = 0; i < 1024 * 1024; ++i) { + T val = Eigen::internal::random(); + int expected_clz = ref_clz(val); + int actual_clz = Eigen::internal::clz(val); + VERIFY(expected_clz == actual_clz); + + int expected_ctz = ref_ctz(val); + int actual_ctz = Eigen::internal::ctz(val); + VERIFY(expected_ctz == actual_ctz); + } +} + +EIGEN_DECLARE_TEST(clz) { + CALL_SUBTEST_1(test_clz_ctz()); + CALL_SUBTEST_2(test_clz_ctz()); + CALL_SUBTEST_3(test_clz_ctz()); + CALL_SUBTEST_4(test_clz_ctz()); + + for (int i = 0; i < g_repeat; i++) { + test_clz_ctz_random(); + test_clz_ctz_random(); + } +} -- GitLab From bd57b99f44874a64f98d35adc28a8daec41f0873 Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Wed, 13 Dec 2023 03:33:49 +0000 Subject: [PATCH 263/266] fix msvc clz (cherry picked from commit 2c4541f735d3dad2ce312bf785c366668d646f3b) --- Eigen/src/Core/MathFunctions.h | 6 ++---- 1 file changed, 2 insertions(+), 4 
deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 299b072d3..bda28fbfd 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -982,10 +982,9 @@ struct count_bits_impl(sizeof(BitsType) * CHAR_BIT); static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { - static const int kLeadingBitsOffset = static_cast((sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT); unsigned long out; _BitScanReverse(&out, static_cast(bits)); - return bits == 0 ? kNumBits : static_cast(out - kLeadingBitsOffset); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { @@ -1003,10 +1002,9 @@ struct count_bits_impl< static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { - static const int kLeadingBitsOffset = static_cast((sizeof(__int64) - sizeof(BitsType)) * CHAR_BIT); unsigned long out; _BitScanReverse64(&out, static_cast(bits)); - return bits == 0 ? kNumBits : static_cast(out - kLeadingBitsOffset); + return bits == 0 ? 
kNumBits : (kNumBits - 1) - static_cast(out); } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { -- GitLab From ebf968b27230195f117e7fd6a696ffe6b9e0973e Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Wed, 20 Dec 2023 14:18:48 -0800 Subject: [PATCH 264/266] Remove c++11 from ctz/clz --- Eigen/src/Core/MathFunctions.h | 36 +++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index bda28fbfd..764c41c97 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -879,10 +879,10 @@ struct meta_floor_log2 template struct count_bits_impl { - static_assert(std::is_integral::value && std::is_unsigned::value, - "BitsType must be an unsigned integer"); - static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); int n = CHAR_BIT * sizeof(BitsType); int shift = n / 2; while (bits > 0 && shift > 0) { @@ -900,6 +900,9 @@ struct count_bits_impl { } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); int n = CHAR_BIT * sizeof(BitsType); int shift = n / 2; while (bits > 0 && shift > 0) { @@ -932,45 +935,48 @@ EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { #if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG template -struct count_bits_impl> { +struct count_bits_impl::type> { static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); - static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT; return bits == 0 ? 
kNumBits : __builtin_clz(static_cast(bits)) - kLeadingBitsOffset; } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); return bits == 0 ? kNumBits : __builtin_ctz(static_cast(bits)); } }; template struct count_bits_impl< - BitsType, std::enable_if_t> { + BitsType, typename enable_if::type> { static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); - static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT; return bits == 0 ? kNumBits : __builtin_clzl(static_cast(bits)) - kLeadingBitsOffset; } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); return bits == 0 ? kNumBits : __builtin_ctzl(static_cast(bits)); } }; template -struct count_bits_impl> { +struct count_bits_impl::type> { static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); - static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT; return bits == 0 ? kNumBits : __builtin_clzll(static_cast(bits)) - kLeadingBitsOffset; } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); return bits == 0 ? 
kNumBits : __builtin_ctzll(static_cast(bits)); } }; @@ -978,16 +984,17 @@ struct count_bits_impl -struct count_bits_impl> { +struct count_bits_impl::type> { static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); - static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); unsigned long out; _BitScanReverse(&out, static_cast(bits)); return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); unsigned long out; _BitScanForward(&out, static_cast(bits)); return bits == 0 ? kNumBits : static_cast(out); @@ -998,16 +1005,17 @@ struct count_bits_impl struct count_bits_impl< - BitsType, std::enable_if_t> { + BitsType, typename enable_if::type> { static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); - static_assert(std::is_integral::value, "BitsType must be a built-in integer"); static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); unsigned long out; _BitScanReverse64(&out, static_cast(bits)); return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); } static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); unsigned long out; _BitScanForward64(&out, static_cast(bits)); return bits == 0 ? 
kNumBits : static_cast(out); -- GitLab From 25270e35dbfb9d407175a321707a3b51a079588d Mon Sep 17 00:00:00 2001 From: Charles Schlosser Date: Thu, 21 Dec 2023 00:57:21 +0000 Subject: [PATCH 265/266] Fix compiler warnings in 3.4 --- Eigen/src/Core/Block.h | 6 +++--- Eigen/src/Core/StlIterators.h | 1 - Eigen/src/SVD/UpperBidiagonalization.h | 3 ++- test/AnnoyingScalar.h | 2 +- test/block.cpp | 11 ++++++----- test/stdlist_overload.cpp | 2 +- test/stdvector.cpp | 4 ++-- test/vectorization_logic.cpp | 4 ++-- unsupported/Eigen/FFT | 19 ++++++++++--------- 9 files changed, 27 insertions(+), 25 deletions(-) diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 5932a9093..9d89b60cf 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -355,7 +355,7 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index i) - : Base(add_to_nullable_pointer(xpr.data(), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())), BlockRows==1 ? 1 : xpr.rows(), @@ -371,7 +371,7 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) - : Base(add_to_nullable_pointer(xpr.data(), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { @@ -384,7 +384,7 @@ class BlockImpl_dense BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) - : Base(add_to_nullable_pointer(xpr.data(), + : Base((blockRows == 0 || blockCols == 0) ? 
NULL : add_to_nullable_pointer(xpr.data(), xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), blockRows, blockCols), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) diff --git a/Eigen/src/Core/StlIterators.h b/Eigen/src/Core/StlIterators.h index 5db3f605b..09041db1d 100644 --- a/Eigen/src/Core/StlIterators.h +++ b/Eigen/src/Core/StlIterators.h @@ -196,7 +196,6 @@ public: pointer_based_stl_iterator() EIGEN_NO_THROW : m_ptr(0) {} pointer_based_stl_iterator(XprType& xpr, Index index) EIGEN_NO_THROW : m_incr(xpr.innerStride()) { - eigen_assert(xpr.data() != NULL || index == 0 || m_incr.value() == 0); m_ptr = xpr.data() + index * m_incr.value(); } diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 5afebef9e..a5b2f60d2 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -161,7 +161,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; - enum { StorageOrder = (traits::Flags & RowMajorBit) ? RowMajor : ColMajor }; + static const int StorageOrder = + (traits::Flags & RowMajorBit) ? 
RowMajor : ColMajor; typedef InnerStride ColInnerStride; typedef InnerStride RowInnerStride; typedef Ref, 0, ColInnerStride> SubColumnType; diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h index 7ace083c5..b62188727 100644 --- a/test/AnnoyingScalar.h +++ b/test/AnnoyingScalar.h @@ -126,7 +126,7 @@ template<> struct NumTraits : NumTraits { enum { - RequireInitialization = 1, + RequireInitialization = 1 }; typedef AnnoyingScalar Real; typedef AnnoyingScalar Nested; diff --git a/test/block.cpp b/test/block.cpp index 84124aba6..667a3be39 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -143,11 +143,12 @@ template void block(const MatrixType& m) // check that linear acccessors works on blocks m1 = m1_copy; - if((MatrixType::Flags&RowMajorBit)==0) - VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1)); - else - VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1+r1*cols), m1(r1,c1)); - + if (c1 > 0 && r1 > 0) { + if ((MatrixType::Flags & RowMajorBit) == 0) + VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1 + c1 * rows), m1(r1, c1)); + else + VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1 + r1 * cols), m1(r1, c1)); + } // now test some block-inside-of-block. 
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp index a78516e24..843e28c3c 100644 --- a/test/stdlist_overload.cpp +++ b/test/stdlist_overload.cpp @@ -63,7 +63,7 @@ void check_stdlist_matrix(const MatrixType& m) ++itw; } - v.resize(21); + v.resize(21, MatrixType::Zero(rows, cols)); set(v, 20, x); VERIFY_IS_APPROX(*get(v, 20), x); v.resize(22,y); diff --git a/test/stdvector.cpp b/test/stdvector.cpp index 18de240c6..9c023d656 100644 --- a/test/stdvector.cpp +++ b/test/stdvector.cpp @@ -52,7 +52,7 @@ void check_stdvector_transform(const TransformType&) { typedef typename TransformType::MatrixType MatrixType; TransformType x(MatrixType::Random()), y(MatrixType::Random()); - std::vector > v(10), w(20, y); + std::vector > v(10, TransformType(MatrixType::Zero())), w(20, y); v[5] = x; w[6] = v[5]; VERIFY_IS_APPROX(w[6], v[5]); @@ -124,7 +124,7 @@ void std_vector_gcc_warning() { typedef Eigen::Vector3f T; std::vector > v; - v.push_back(T()); + v.push_back(T::Zero()); } EIGEN_DECLARE_TEST(stdvector) diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index b5464992e..19375191d 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -280,8 +280,8 @@ struct vectorization_logic_half // EIGEN_UNALIGNED_VECTORIZE is 0 (the matrix is assumed unaligned). // Adjust the matrix sizes to account for these alignment issues. enum { PacketBytes = sizeof(Scalar)*PacketSize }; - enum { MinVSize = EIGEN_UNALIGNED_VECTORIZE ? PacketSize - : PacketBytes >= EIGEN_MIN_ALIGN_BYTES ? PacketSize + enum { MinVSize = EIGEN_UNALIGNED_VECTORIZE ? int(PacketSize) + : int(PacketBytes) >= EIGEN_MIN_ALIGN_BYTES ? 
int(PacketSize) : (EIGEN_MIN_ALIGN_BYTES + sizeof(Scalar) - 1) / sizeof(Scalar) }; typedef Matrix Vector1; diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT index c8c311a60..72cd52143 100644 --- a/unsupported/Eigen/FFT +++ b/unsupported/Eigen/FFT @@ -162,15 +162,16 @@ class FFT typedef typename impl_type::Scalar Scalar; typedef typename impl_type::Complex Complex; - enum Flag { - Default=0, // goof proof - Unscaled=1, - HalfSpectrum=2, - // SomeOtherSpeedOptimization=4 - Speedy=32767 - }; - - FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { } + typedef int Flag; + static const Flag Default = 0; + static const Flag Unscaled = 1; + static const Flag HalfSpectrum = 2; + static const Flag Speedy = 32767; + + FFT(const impl_type& impl = impl_type(), Flag flags = Default) : m_impl(impl), m_flag(flags) + { + eigen_assert((flags == Default || flags == Unscaled || flags == HalfSpectrum || flags == Speedy) && "invalid flags argument"); + } inline bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;} -- GitLab From a0956397b31fc70f5df575e3be4bfb9347c1dc65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Patrick=20Gl=C3=B6ckner?= Date: Sun, 21 Jan 2024 16:16:53 +0000 Subject: [PATCH 266/266] Fixed warning regarding CMP0146 introduced by CMake 3.28.1 --- unsupported/test/CMakeLists.txt | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index ab5b684e9..e917dffb5 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -289,8 +289,15 @@ endif() endif() -find_package(CUDA 9.0) -if(CUDA_FOUND AND EIGEN_TEST_CUDA) +# These tests needs nvcc +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) +else() + message(STATUS "Could NOT find CUDA.") +endif() + +if(CMAKE_CUDA_COMPILER AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags 
since they trigger thousands of compilation warnings # in the CUDA runtime -- GitLab
OuterStarts:035810\em 12