diff --git a/.gitignore b/.gitignore index f6ab76fdadf4d5f19c6b124909175d3f509b4c33..19dfac9ef6a76a91edd2844e09751a15524e1691 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ lapack/reference .settings Makefile !ci/build.gitlab-ci.yml +!scripts/buildtests.in +!Eigen/Core +!Eigen/src/Core diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ba7d38721875ff3ce7001e22babd2028f414cf0..ac8042b18900370e5d563a1f9a9716049ec89e9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,16 @@ cmake_minimum_required(VERSION 3.5.0) project(Eigen3) +# Remove this block after bumping CMake to v3.21.0 +# PROJECT_IS_TOP_LEVEL is defined then by default +if(CMAKE_VERSION VERSION_LESS 3.21.0) + if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(PROJECT_IS_TOP_LEVEL TRUE) + else() + set(PROJECT_IS_TOP_LEVEL FALSE) + endif() +endif() + # guard against in-source builds if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) @@ -23,7 +33,7 @@ endif() ############################################################################# -# retrieve version information # +# retrieve version information # ############################################################################# # automatically parse the version number @@ -88,6 +98,23 @@ else() ei_add_cxx_compiler_flag("-std=c++03") endif() +function(ei_maybe_separate_arguments variable mode args) + # Use separate_arguments if the input is a single string containing a space. + # Otherwise, if it is already a list or doesn't have a space, just propagate + # the original value. This is to better support multi-argument lists. + list(LENGTH args list_length) + if (${list_length} EQUAL 1) + string(FIND "${args}" " " has_space) + if (${has_space} GREATER -1) + separate_arguments(args ${mode} "${args}") + endif() + endif() + set(${variable} ${args} PARENT_SCOPE) +endfunction(ei_maybe_separate_arguments) + +# Determine if we should build shared libraries on this platform. +get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS) + ############################################################################# # find how to link to the standard libraries # ############################################################################# @@ -98,6 +125,10 @@ find_package(StandardMathLibrary) set(EIGEN_TEST_CUSTOM_LINKER_FLAGS "" CACHE STRING "Additional linker flags when linking unit tests.") set(EIGEN_TEST_CUSTOM_CXX_FLAGS "" CACHE STRING "Additional compiler flags when compiling unit tests.") +# Convert space-separated arguments into CMake lists for downstream consumption. +ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}") +ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "") if(NOT STANDARD_MATH_LIBRARY_FOUND) @@ -106,13 +137,11 @@ if(NOT STANDARD_MATH_LIBRARY_FOUND) "Can't link to the standard math library. 
Please report to the Eigen developers, telling them about your platform.") else() - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}") else() set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}") endif() - endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -122,6 +151,7 @@ else() endif() option(EIGEN_BUILD_BTL "Build benchmark suite" OFF) +option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF) # Disable pkgconfig only for native Windows builds if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows) @@ -250,18 +280,12 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX512 in tests/examples") endif() option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) if(EIGEN_TEST_AVX512DQ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq -mfma") message(STATUS "Enabling AVX512DQ in tests/examples") endif() @@ -366,11 +390,19 @@ else() endif() option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF) - if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) + option(EIGEN_TEST_AVX2 "Enable/Disable FMA/AVX2 in tests/examples" OFF) + if((EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) OR EIGEN_TEST_AVX2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") message(STATUS "Enabling FMA/AVX2 in tests/examples") endif() + option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) + option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) + if(EIGEN_TEST_AVX512 OR EIGEN_TEST_AVX512DQ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512") + message(STATUS "Enabling AVX512 in tests/examples") + endif() + endif() option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF) @@ -413,7 +445,8 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") +set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") +set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) @@ -424,25 +457,26 @@ endif() if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR) set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed") + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed") else() set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}/eigen3" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed" ) endif() set(CMAKEPACKAGE_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/eigen3/cmake" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed" + CACHE PATH "The directory 
relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed" ) set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed" ) foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR) + # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}". if(IS_ABSOLUTE "${${var}}") - message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}") + file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}") endif() endforeach() @@ -478,8 +512,9 @@ if(EIGEN_BUILD_DOC) endif() -option(BUILD_TESTING "Enable creation of Eigen tests." ON) -if(BUILD_TESTING) +cmake_dependent_option(BUILD_TESTING "Enable creation of tests." ON "PROJECT_IS_TOP_LEVEL" OFF) +option(EIGEN_BUILD_TESTING "Enable creation of Eigen tests." ${BUILD_TESTING}) +if(EIGEN_BUILD_TESTING) include(EigenConfigureTesting) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -491,6 +526,9 @@ if(BUILD_TESTING) add_subdirectory(failtest) endif() +include(CMakeDetermineFortranCompiler) +option(EIGEN_BUILD_BLAS "Toggles the building of the Eigen Blas library" ${CMAKE_Fortran_COMPILER}) +option(EIGEN_BUILD_LAPACK "Toggles the building of the included Eigen LAPACK library" ${CMAKE_Fortran_COMPILER}) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(blas) add_subdirectory(lapack) @@ -541,13 +579,13 @@ if(EIGEN_BUILD_BTL) add_subdirectory(bench/btl EXCLUDE_FROM_ALL) endif() -if(NOT WIN32) +if(NOT WIN32 AND EIGEN_BUILD_SPBENCH) add_subdirectory(bench/spbench EXCLUDE_FROM_ALL) endif() configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) ei_testing_print_summary() endif() @@ -555,34 +593,35 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) -if(cmake_generator_tolower MATCHES "makefile") - message(STATUS "Available targets (use: make TARGET):") -else() - message(STATUS "Available targets (use: cmake --build . --target TARGET):") -endif() -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "Target | Description") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "install | Install Eigen. Headers will be installed to:") -message(STATUS " | /") -message(STATUS " | Using the following values:") -message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") -message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") -message(STATUS " | Change the install location of Eigen headers using:") -message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") -message(STATUS " | Or:") -message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") -message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") -if(BUILD_TESTING) - message(STATUS "check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") +if(PROJECT_IS_TOP_LEVEL) + string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) + if(cmake_generator_tolower MATCHES "makefile") + message(STATUS "Available targets (use: make TARGET):") + else() + message(STATUS "Available targets (use: cmake --build . 
--target TARGET):") + endif() + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "Target | Description") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "install | Install Eigen. Headers will be installed to:") + message(STATUS " | /") + message(STATUS " | Using the following values:") + message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") + message(STATUS " | Change the install location of Eigen headers using:") + message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") + message(STATUS " | Or:") + message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") + message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") + if(EIGEN_BUILD_TESTING) + message(STATUS "check | Build and run the unit-tests. Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + endif() + message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "uninstall| Remove files installed by the install target") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "") endif() -message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") -message(STATUS "uninstall| Remove files installed by the install target") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "") - set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} ) set ( EIGEN_VERSION_MAJOR ${EIGEN_WORLD_VERSION} ) @@ -608,6 +647,8 @@ set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen) install (TARGETS eigen EXPORT Eigen3Targets) +option(EIGEN_BUILD_CMAKE_PACKAGE "Enables the creation of EigenConfig.cmake and related files" ON) +if(EIGEN_BUILD_CMAKE_PACKAGE) configure_package_config_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake @@ -643,6 +684,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake # Add uninstall target add_custom_target ( uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) +endif() if (EIGEN_SPLIT_TESTSUITE) ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}") diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport index bed8924d31e0b3f46713cf74ba3deb6a63f9f590..1037bd55de367eaae9cc2484d39acc2146f3cadb 100644 --- a/Eigen/CholmodSupport +++ b/Eigen/CholmodSupport @@ -22,7 +22,7 @@ extern "C" { * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. - * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). + * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). 
* * For the sake of completeness, this module also propose the two following classes: * - class CholmodSimplicialLLT diff --git a/Eigen/Core b/Eigen/Core index 5921e15f9df46319ceb5436296b3271ca4dc9e65..1e53ba49b5ecfa9bb7d0bdeeb34648a3f6772123 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -83,8 +83,8 @@ #include #include #include -#include #ifndef EIGEN_NO_IO + #include #include #endif #include @@ -109,7 +109,8 @@ #endif // required for __cpuid, needs to be included after cmath -#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE +// also required for _BitScanReverse on Windows on ARM +#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE #include #endif diff --git a/Eigen/LU b/Eigen/LU index 0fb184bcb84df406c281730421ba3051e03e8c1c..1236ceb04676f5e180b589f8ca70c3d9362710cf 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -38,9 +38,7 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON #include "src/LU/arch/InverseSize4.h" #endif diff --git a/Eigen/SparseLU b/Eigen/SparseLU index 37c4a5c5a8b305add93e2aae0eb7eac624f3ce68..047cf0dca94923dd820733d6a549c8db3ee4a1af 100644 --- a/Eigen/SparseLU +++ b/Eigen/SparseLU @@ -25,8 +25,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include "src/SparseLU/SparseLU_gemm_kernel.h" - #include "src/SparseLU/SparseLU_Structs.h" #include "src/SparseLU/SparseLU_SupernodalMatrix.h" #include "src/SparseLU/SparseLUImpl.h" diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index ab2ebf37e6c72cfee6b09c109cfb2aacc3ba0f14..7d76f0c256fd6207819921680217b30697e81ca3 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -591,7 +591,7 @@ struct dense_assignment_loop enum { innerSize = DstXprType::InnerSizeAtCompileTime, packetSize =unpacket_traits::size, - vectorizableSize = (innerSize/packetSize)*packetSize, + vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize), size = DstXprType::SizeAtCompileTime }; for(Index outer = 0; outer < kernel.outerSize(); ++outer) @@ -785,6 +785,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType dense_assignment_loop::run(kernel); } +// Specialization for filling the destination with a constant value. +#ifndef EIGEN_GPU_COMPILE_PHASE +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp, DstXprType>& src, const internal::assign_op& func) +{ + resize_if_allowed(dst, src, func); + std::fill_n(dst.data(), dst.size(), src.functor()()); +} +#endif + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) { diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 480e0449556319bec2d73773a2c65a82ea4d8aa9..878c0240ac1699abd0b05e2d34cea9fa4929fe72 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase * \warning the internal storage must be column major. 
*/ inline Block col(Index i) { - EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); Index start = 0; Index len = coeffs().rows(); if (i<=supers()) @@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase template struct DiagonalIntReturnType { enum { - ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)), + ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)), Conjugate = ReturnOpposite && NumTraits::IsComplex, ActualIndex = ReturnOpposite ? -Index : Index, DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) @@ -192,7 +192,7 @@ struct traits > Options = _Options, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic }; - typedef Matrix CoefficientsType; + typedef Matrix CoefficientsType; }; template diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 3206d6633e62c29f4466155f47725511a34ba640..9d89b60cf8f0d4b4740fad1f0bf41f5fd8a1bffd 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -260,19 +260,19 @@ template - inline PacketScalar packet(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const { return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template - inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template - inline PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const { return m_xpr.template packet (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -280,7 +280,7 @@ template - inline void writePacket(Index index, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val) { m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -334,6 +334,17 @@ class BlockImpl_dense enum { XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0 }; + + /** \internal Returns base+offset (unless base is null, in which case returns null). + * Adding an offset to nullptr is undefined behavior, so we must avoid it. + */ + template + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE + static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) + { + return base != NULL ? base+offset : NULL; + } + public: typedef MapBase Base; @@ -344,8 +355,9 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) - || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 
1 : xpr.cols()), m_xpr(xpr), @@ -359,7 +371,8 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); @@ -371,7 +384,9 @@ class BlockImpl_dense BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), + : Base((blockRows == 0 || blockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + blockRows, blockCols), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index e32c4ac5b17415ff70fdb15ff1dbbe8cd3a7c760..fa4d7c33114f2112dc199dcd32a9307e57eb842f 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -14,54 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i); } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; - + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? 
j : i); } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; @@ -81,16 +83,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::RowsAtCompileTime>::run(evaluator); + return internal::all_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (!evaluator.coeff(i, j)) return false; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false; return true; } } @@ -105,16 +107,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::RowsAtCompileTime>::run(evaluator); + return internal::any_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (evaluator.coeff(i, j)) return true; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true; return false; } } @@ -156,7 +158,7 @@ inline bool DenseBase::allFinite() const return !((derived()-derived()).hasNaN()); #endif } - + } // end namespace Eigen #endif // EIGEN_ALLANDANY_H diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 90c552f13e24bfd338705548d25136dcc494e79e..0ff8c8deb820deb7bbaa2f762afc5e4366f29616 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -561,7 +561,7 @@ struct unary_evaluator, IndexBased > typedef CwiseUnaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = evaluator::Flags & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? 
PacketAccessBit : 0)), @@ -606,13 +606,13 @@ struct unary_evaluator, IndexBased > protected: // this helper permits to completely eliminate the functor if it is empty - class Data : private UnaryOp + struct Data { - public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } + const UnaryOp& func() const { return op; } + UnaryOp op; evaluator argImpl; }; @@ -639,7 +639,7 @@ struct ternary_evaluator, IndexBased typedef CwiseTernaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Arg1Flags = evaluator::Flags, Arg2Flags = evaluator::Flags, @@ -700,12 +700,13 @@ struct ternary_evaluator, IndexBased protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private TernaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} + Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TernaryOp& func() const { return static_cast(*this); } + const TernaryOp& func() const { return op; } + TernaryOp op; evaluator arg1Impl; evaluator arg2Impl; evaluator arg3Impl; @@ -735,7 +736,7 @@ struct binary_evaluator, IndexBased, IndexBase typedef CwiseBinaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), LhsFlags = evaluator::Flags, RhsFlags = evaluator::Flags, @@ -793,12 +794,13 @@ struct binary_evaluator, IndexBased, IndexBase protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private BinaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} + Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const BinaryOp& func() const { return static_cast(*this); } + const BinaryOp& func() const { return op; } + BinaryOp op; evaluator lhsImpl; evaluator rhsImpl; }; @@ -815,7 +817,7 @@ struct unary_evaluator, IndexBased> typedef CwiseUnaryView XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), @@ -858,12 +860,13 @@ struct unary_evaluator, IndexBased> protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private UnaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } + const 
UnaryOp& func() const { return op; } + UnaryOp op; evaluator argImpl; }; diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index 59974a545177dd4e74fc351724bbb6eddce06ce3..2202b1cc6b78c19de49e23ab81fd6e0982372996 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -102,7 +102,7 @@ class CwiseBinaryOp : #if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 //Required for Visual Studio or the Copy constructor will probably not get inlined! - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp&) = default; #endif diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 9b16db68d48d2bfbf7e5c076ef2ba1504074f3f3..cdd0f5f168ea51f446128d57b444eae5d5db2c70 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -324,9 +324,9 @@ template class DenseBase typedef Transpose TransposeReturnType; EIGEN_DEVICE_FUNC TransposeReturnType transpose(); - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; EIGEN_DEVICE_FUNC - ConstTransposeReturnType transpose() const; + const ConstTransposeReturnType transpose() const; EIGEN_DEVICE_FUNC void transposeInPlace(); diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index f6e1d0af1a9254e0e442c7846be419eb5b24cefd..08ef6c530617401094344790e64a7ac21addb2ad 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -163,6 +163,30 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; +struct plain_array_helper { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void copy(const plain_array& src, const Eigen::Index size, + plain_array& dst) { + smart_copy(src.array, src.array + size, dst.array); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void swap(plain_array& a, const Eigen::Index a_size, + plain_array& b, const Eigen::Index b_size) { + if (a_size < b_size) { + std::swap_ranges(b.array, b.array + a_size, a.array); + smart_move(b.array + a_size, b.array + b_size, a.array + a_size); + } else if (a_size > b_size) { + std::swap_ranges(a.array, a.array + b_size, b.array); + smart_move(a.array + b_size, a.array + a_size, b.array + b_size); + } else { + std::swap_ranges(a.array, a.array + a_size, b.array); + } + } +}; + } // end namespace internal /** \internal @@ -190,17 +214,26 @@ template class DenseSt EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} +#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } +#else + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; +#endif +#if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { if (this != &other) m_data = other.m_data; return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; +#endif #if EIGEN_HAS_RVALUE_REFERENCES +#if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT : m_data(std::move(other.m_data)) { @@ -211,6 +244,10 @@ template class DenseSt m_data = std::move(other.m_data); return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = 
default; +#endif #endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) @@ -268,21 +305,25 @@ template class DenseStorage class DenseStorage class DenseStorage::diagonal() /** This is the const version of diagonal(). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline +const typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -209,18 +210,18 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline Diagonal MatrixBase::diagonal(Index index) { - return DiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** This is the const version of diagonal(Index). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline const Diagonal MatrixBase::diagonal(Index index) const { - return ConstDiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this @@ -237,20 +238,20 @@ MatrixBase::diagonal(Index index) const template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template DiagonalIndexReturnType::Type +inline Diagonal MatrixBase::diagonal() { - return typename DiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } /** This is the const version of diagonal(). */ template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type +inline const Diagonal MatrixBase::diagonal() const { - return typename ConstDiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } } // end namespace Eigen diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 41a8cb437bc3f6ffdff988b49cbd0cff699d3d8d..abac7ad48ed85befd608a2522fdbdb187187dc6f 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -18,14 +18,9 @@ namespace internal { // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // looking at the static assertions. Thus this is a trick to get better compile errors. template + bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime && + ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) || + (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))> struct dot_nocheck { typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; @@ -86,7 +81,7 @@ MatrixBase::dot(const MatrixBase& other) const //---------- implementation of L2 norm and related functions ---------- -/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm. +/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm. * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. 
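
The three equivalent quantities this doc fix describes can be checked directly. A minimal sketch; all calls are standard Eigen API, and squared_norm_demo is an illustrative name:

    #include <Eigen/Dense>
    #include <cassert>

    // For a vector v: squaredNorm() == v.dot(v) == sum of the squared entries.
    void squared_norm_demo() {
      Eigen::Vector3d v(1.0, 2.0, 3.0);
      double a = v.squaredNorm();           // 14
      double b = v.dot(v);                  // 14
      double c = v.array().square().sum();  // 14
      assert(a == b && b == c);             // exact for these small integers
    }
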
* diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 53800a005771016cac01660ede7d818e4cbe1608..cf677a1905f1e80697eb6c0daccb48f745c605e3 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -129,6 +129,22 @@ template struct packet_traits : default_packet_traits template struct packet_traits : packet_traits { }; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1, + vectorizable = false, + masked_load_available=false, + masked_store_available=false + }; +}; + +template struct unpacket_traits : unpacket_traits { }; + template struct type_casting_traits { enum { VectorizedCast = 0, @@ -154,6 +170,18 @@ struct eigen_packet_wrapper T m_val; }; + +/** \internal A convenience utility for determining if the type is a scalar. + * This is used to enable some generic packet implementations. + */ +template +struct is_scalar { + typedef typename unpacket_traits::type Scalar; + enum { + value = internal::is_same::value + }; +}; + /** \internal \returns static_cast(a) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline TgtPacket @@ -215,13 +243,59 @@ pmul(const bool& a, const bool& b) { return a && b; } template EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) { return a/b; } -/** \internal \returns one bits */ +// In the generic case, memset to all one bits. +template +struct ptrue_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){ + Packet b; + memset(static_cast(&b), 0xff, sizeof(Packet)); + return b; + } +}; + +// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value). +// Although this is technically not a valid bitmask, the scalar path for pselect +// uses a comparison to zero, so this should still work in most cases. We don't +// have another option, since the scalar type requires initialization. +template +struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ + return T(1); + } +}; + +/** \internal \returns one bits. */ template EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} +ptrue(const Packet& a) { + return ptrue_impl::run(a); +} -/** \internal \returns zero bits */ +// In the general case, memset to zero. +template +struct pzero_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) { + Packet b; + memset(static_cast(&b), 0x00, sizeof(Packet)); + return b; + } +}; + +// For scalars, explicitly set to Scalar(0), since the underlying representation +// for zero may not consist of all-zero bits. +template +struct pzero_impl::value>::type> { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { + return T(0); + } +}; + +/** \internal \returns packet of zeros */ template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;} +pzero(const Packet& a) { + return pzero_impl::run(a); +} /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet @@ -238,33 +312,6 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? 
pzero(a) : ptrue(a); } -template<> EIGEN_DEVICE_FUNC inline float pzero(const float& a) { - EIGEN_UNUSED_VARIABLE(a) - return 0.f; -} - -template<> EIGEN_DEVICE_FUNC inline double pzero(const double& a) { - EIGEN_UNUSED_VARIABLE(a) - return 0.; -} - -template -EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { - RealScalar b = ptrue(RealScalar(0)); - return std::complex(b, b); -} - -template -EIGEN_DEVICE_FUNC inline Packet bitwise_helper(const Packet& a, const Packet& b, Op op) { - const unsigned char* a_ptr = reinterpret_cast(&a); - const unsigned char* b_ptr = reinterpret_cast(&b); - Packet c; - unsigned char* c_ptr = reinterpret_cast(&c); - for (size_t i = 0; i < sizeof(Packet); ++i) { - *c_ptr++ = op(*a_ptr++, *b_ptr++); - } - return c; -} template struct bit_and { @@ -287,42 +334,123 @@ struct bit_xor { } }; +template +struct bit_not { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +// Use operators &, |, ^, ~. +template +struct operator_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not()(a); } +}; + +// Apply binary operations byte-by-byte +template +struct bytewise_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { + return binary(a, b, bit_and()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + return binary(a, b, bit_or()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { + return binary(a, b, bit_xor()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + return unary(a,bit_not()); + } + + private: + template + EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++); + } + return c; + } + + template + EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + const unsigned char* b_ptr = reinterpret_cast(&b); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++, *b_ptr++); + } + return c; + } +}; + +// In the general case, use byte-by-byte manipulation. +template +struct bitwise_helper : public bytewise_bitwise_helper {}; + +// For integers or non-trivial scalars, use binary operators. 
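
A standalone sketch of the byte-by-byte technique that bytewise_bitwise_helper above uses: it works for any trivially copyable scalar because reading an object's bytes through unsigned char* is well-defined aliasing. Here bytewise_binary is an illustrative name of mine, not Eigen's. The operator-based specialization that the comment above introduces continues immediately below this note.

    #include <cstddef>

    // Apply a binary op to two values byte by byte, as bytewise_bitwise_helper does.
    template <typename T, typename Op>
    T bytewise_binary(const T& a, const T& b, Op op) {
      const unsigned char* pa = reinterpret_cast<const unsigned char*>(&a);
      const unsigned char* pb = reinterpret_cast<const unsigned char*>(&b);
      T c;
      unsigned char* pc = reinterpret_cast<unsigned char*>(&c);
      for (std::size_t i = 0; i < sizeof(T); ++i) pc[i] = op(pa[i], pb[i]);
      return c;
    }
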
+template +struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + > : public operator_bitwise_helper {}; + /** \internal \returns the bitwise and of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pand(const Packet& a, const Packet& b) { - return bitwise_helper(a, b, bit_and()); + return bitwise_helper::bitwise_and(a, b); } /** \internal \returns the bitwise or of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet por(const Packet& a, const Packet& b) { - return bitwise_helper(a ,b, bit_or()); + return bitwise_helper::bitwise_or(a, b); } /** \internal \returns the bitwise xor of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pxor(const Packet& a, const Packet& b) { - return bitwise_helper(a ,b, bit_xor()); + return bitwise_helper::bitwise_xor(a, b); +} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + return bitwise_helper::bitwise_not(a); } /** \internal \returns the bitwise and of \a a and not \a b */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return pand(a, pxor(ptrue(b), b)); } +pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); } + +// In the general case, use bitwise select. +template +struct pselect_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); + } +}; + +// For scalars, use ternary select. +template +struct pselect_impl::value>::type > { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return numext::equal_strict(mask, Packet(0)) ? b : a; + } +}; /** \internal \returns \a or \b for each field in packet according to \mask */ template EIGEN_DEVICE_FUNC inline Packet pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} - -template<> EIGEN_DEVICE_FUNC inline float pselect( - const float& cond, const float& a, const float&b) { - return numext::equal_strict(cond,0.f) ? b : a; -} - -template<> EIGEN_DEVICE_FUNC inline double pselect( - const double& cond, const double& a, const double& b) { - return numext::equal_strict(cond,0.) ? 
b : a; + return pselect_impl::run(mask, a, b); } template<> EIGEN_DEVICE_FUNC inline bool pselect( diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 08476251d32234dfdd2e9b0cba788c58484bc458..05c2bc9cc632c9fb0fdcbddda604dca586c77808 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -122,10 +122,10 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } + Index rows() const { return internal::index_list_size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } + Index cols() const { return internal::index_list_size(m_colIndices); } /** \returns the nested expression */ const typename internal::remove_all::type& @@ -189,12 +189,16 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -204,6 +208,8 @@ struct unary_evaluator, IndexBased> EIGEN_STATIC_ASSERT_LVALUE(XprType) Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -212,6 +218,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -220,6 +228,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 93d2ae9071d599188ad031930d719222c8d36e18..218cc157f386924cae44ca6741d6aaba9fc96f74 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -47,7 +47,7 @@ private: * \brief A matrix or vector expression mapping an existing array of data. * * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. 
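
A minimal example of the MapOptions alignment parameter this doc fix describes; the corrected '+' line of the fix follows right below. The alignas buffer and sum_aligned are illustrative, not part of the patch:

    #include <Eigen/Dense>

    // Map a 16-byte-aligned C array as a Vector4f, telling Eigen about the alignment.
    float sum_aligned() {
      alignas(16) float buf[4] = {1.f, 2.f, 3.f, 4.f};
      Eigen::Map<const Eigen::Vector4f, Eigen::Aligned16> v(buf);
      return v.sum();  // reads buf directly, no copy
    }
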
+ * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. * The default is \c #Unaligned. * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout * of an ordinary, contiguous array. This can be overridden by specifying strides. diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 29201214f0ed5ab1c5238e8761e9bb80bf21e66b..764c41c974f3f92efb971f216636c02500145664 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2006-2010 Benoit Jacob +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -260,19 +261,8 @@ struct conj_default_impl } }; -template struct conj_impl : conj_default_impl {}; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct conj_impl > -{ - EIGEN_DEVICE_FUNC - static inline std::complex run(const std::complex& x) - { - return std::complex(x.real(), -x.imag()); - } -}; -#endif +template::IsComplex> +struct conj_impl : conj_default_impl {}; template struct conj_retval @@ -582,7 +572,9 @@ struct rint_retval * Implementation of arg * ****************************************************************************/ -#if EIGEN_HAS_CXX11_MATH +// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. +// This seems to be fixed in VS 2019. +#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) // std::arg is only defined for types of std::complex, or integer types or float/double/long double template::IsComplex || is_integral::value @@ -592,16 +584,13 @@ struct arg_default_impl; template struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) + static inline RealScalar run(const Scalar& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - // HIP does not seem to have a native device side implementation for the math routine "arg" + // There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg. using std::arg; - #else - EIGEN_USING_STD(arg); - #endif - return static_cast(arg(x)); + return static_cast(arg(x)); } }; @@ -612,7 +601,7 @@ struct arg_default_impl { EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); + return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); } }; #else @@ -623,7 +612,7 @@ struct arg_default_impl EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); + return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); } }; @@ -697,6 +686,30 @@ struct expm1_retval typedef Scalar type; }; +/**************************************************************************** +* Implementation of log * +****************************************************************************/ + +// Complex log defined in MathFunctionsImpl.h. 
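
The identity implemented in MathFunctionsImpl.h is log(z) = log|z| + i*atan2(Im z, Re z). A quick standalone check against the standard library (complex_log_check is an illustrative name); the forward declaration announced by the comment above follows right below this note.

    #include <complex>
    #include <cmath>
    #include <cassert>

    // log(z) = log(|z|) + i*atan2(imag, real); compare against std::log.
    void complex_log_check() {
      std::complex<double> z(3.0, 4.0);
      std::complex<double> mine(std::log(std::abs(z)),
                                std::atan2(z.imag(), z.real()));
      assert(std::abs(mine - std::log(z)) < 1e-12);
    }
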
+template EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z); + +template +struct log_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD(log); + return static_cast(log(x)); + } +}; + +template +struct log_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& z) + { + return complex_log(z); + } +}; + /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -710,7 +723,7 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD(log); Scalar x1p = RealScalar(1) + x; - Scalar log_1p = log(x1p); + Scalar log_1p = log_impl::run(x1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); const bool is_inf = numext::equal_strict(x1p, log_1p); return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1))); @@ -864,13 +877,159 @@ struct meta_floor_log2 // no value, error at compile time }; -template -struct random_default_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& y) - { - if (y <= x) - return x; +template +struct count_bits_impl { + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits >> shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits << shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } +}; + +// Count leading zeros. +template +EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + return count_bits_impl::clz(bits); +} + +// Count trailing zeros. +template +EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return count_bits_impl::ctz(bits); +} + +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clz(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctz(static_cast(bits)); + } +}; + +template +struct count_bits_impl< + BitsType, typename enable_if::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? 
kNumBits : __builtin_clzl(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctzl(static_cast(bits)); + } +}; + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzll(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctzll(static_cast(bits)); + } +}; + +#elif EIGEN_COMP_MSVC + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanReverse(&out, static_cast(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanForward(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#ifdef _WIN64 + +template +struct count_bits_impl< + BitsType, typename enable_if::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanReverse64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanForward64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#endif // _WIN64 + +#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct random_default_impl { + static inline Scalar run(const Scalar& x, const Scalar& y) { + if (y <= x) return x; // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself. typedef typename make_unsigned::type ScalarU; // ScalarX is the widest of ScalarU and unsigned int. 
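
Between these hunks, a standalone version of the binary-search clz fallback that count_bits_impl above uses when no compiler builtin applies, specialized to uint32_t to make the halving visible. clz32 is an illustrative name; the generic fallback follows the same logic:

    #include <cstdint>
    #include <cassert>

    // Count leading zeros by halving the search window, as the generic fallback does.
    int clz32(uint32_t bits) {
      int n = 32, shift = 16;
      while (bits > 0 && shift > 0) {
        uint32_t y = bits >> shift;           // is there a set bit in the upper half?
        if (y > 0) { n -= shift; bits = y; }  // yes: discard the lower half
        shift /= 2;
      }
      if (shift == 0) --n;  // the loop narrowed down to a single set bit
      return n;             // clz32(0) == 32 by convention
    }

    void clz_demo() {
      assert(clz32(1u) == 31 && clz32(0x80000000u) == 0 && clz32(0u) == 32);
    }
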
@@ -1015,11 +1174,15 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
 }
 
 //MSVC defines a _isnan builtin function, but for double only
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+#endif
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
 
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+#endif
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
 
@@ -1033,12 +1196,16 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_ms
 #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
 #endif
 
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+#endif
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+#endif
 
 #undef EIGEN_TMP_NOOPT_ATTRIB
 
@@ -1095,6 +1262,8 @@ EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
 {
   return fmin(x, y);
 }
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
@@ -1106,6 +1275,7 @@ EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
   return fminl(x, y);
 #endif
 }
+#endif
 
 template<typename T>
 EIGEN_DEVICE_FUNC
@@ -1125,6 +1295,7 @@ EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
 {
   return fmax(x, y);
 }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
@@ -1137,6 +1308,7 @@ EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
 #endif
 }
 #endif
+#endif
 
 #if defined(SYCL_DEVICE_ONLY)
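Editorial note: every `EIGEN_GPU_COMPILE_PHASE` guard added in these hunks follows one pattern — `long double` overloads stay visible to the host pass only, because device compilers demote `long double` to `double` or reject it outright. A minimal sketch of the pattern under made-up names (`MY_GPU_COMPILE_PHASE`, `my_isnan` are not Eigen identifiers):

    #include <cmath>
    #include <iostream>

    // Stand-in for EIGEN_GPU_COMPILE_PHASE: defined only while compiling
    // for the device.
    #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    #define MY_GPU_COMPILE_PHASE
    #endif

    inline bool my_isnan(float x)  { return std::isnan(x); }
    inline bool my_isnan(double x) { return std::isnan(x); }
    #ifndef MY_GPU_COMPILE_PHASE
    // Host-only overload: never seen during the device compilation phase.
    inline bool my_isnan(long double x) { return std::isnan(x); }
    #endif

    int main() { std::cout << my_isnan(std::nanl("")) << "\n"; }  // prints 1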
@@ -1293,8 +1465,8 @@ EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
 {
   return fabs(x - y);
 }
-#if !defined(EIGEN_GPUCC) // HIP and CUDA do not support long double.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
@@ -1470,8 +1642,7 @@ T rsqrt(const T& x)
 
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T log(const T &x) {
-  EIGEN_USING_STD(log);
-  return static_cast<T>(log(x));
+  return internal::log_impl<T>::run(x);
 }
 
 #if defined(SYCL_DEVICE_ONLY)
@@ -2022,6 +2193,18 @@ struct rsqrt_impl {
   }
 };
 
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T>, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x)
+  {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 0d3f317bbe4a312bdd017288c71e2101927c3c76..4eaaaa78449031830cea7446210226afcd2467d0 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -184,6 +184,15 @@ EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
        : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
 }
 
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
+  // Computes complex log.
+  T a = numext::abs(z);
+  EIGEN_USING_STD(atan2);
+  T b = atan2(z.imag(), z.real());
+  return std::complex<T>(numext::log(a), b);
+}
+
 } // end namespace internal
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 45c3a596ecb09d5e55f4eaf3c742d0badb624373..d93a7e377ac1f1b83047e9e2220cc3f932fc9ad5 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -206,28 +206,22 @@ template<typename Derived> class MatrixBase
     EIGEN_DEVICE_FUNC
     DiagonalReturnType diagonal();
 
-    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    typedef Diagonal<const Derived> ConstDiagonalReturnType;
     EIGEN_DEVICE_FUNC
-    ConstDiagonalReturnType diagonal() const;
-
-    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived, Index> Type; };
-    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived, Index> Type; };
+    const ConstDiagonalReturnType diagonal() const;
 
     template<int Index>
     EIGEN_DEVICE_FUNC
-    typename DiagonalIndexReturnType<Index>::Type diagonal();
+    Diagonal<Derived, Index> diagonal();
 
     template<int Index>
     EIGEN_DEVICE_FUNC
-    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
-
-    typedef Diagonal<Derived, DynamicIndex> DiagonalDynamicIndexReturnType;
-    typedef typename internal::add_const<Diagonal<const Derived, DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
+    const Diagonal<const Derived, Index> diagonal() const;
 
     EIGEN_DEVICE_FUNC
-    DiagonalDynamicIndexReturnType diagonal(Index index);
+    Diagonal<Derived, DynamicIndex> diagonal(Index index);
     EIGEN_DEVICE_FUNC
-    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
+    const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;
 
     template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
     template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index fdd4d4f519f19e2302ad488eec6e38e955b31012..e232651386446a53e5390ed9811277cbf2932eb1 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -245,12 +245,25 @@ template<> struct NumTraits<double> : GenericNumTraits<double>
   static inline double dummy_precision() { return 1e-12; }
 };
 
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct NumTraits<long double>
   : GenericNumTraits<long double>
 {
-  EIGEN_CONSTEXPR
-  static inline long double dummy_precision() { return 1e-15l; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
+
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline long double epsilon()
+  {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };
+#endif
 
 template<typename _Real> struct NumTraits<std::complex<_Real> >
   : GenericNumTraits<std::complex<_Real> >
@@ -289,9 +302,9 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
     IsInteger = NumTraits<Scalar>::IsInteger,
     IsSigned  = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
+    AddCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
+    MulCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
index 0be694259fe4d43305550ea7741eadea3a3f4bb9..17c06f0783dcb97cc019e83620df49cd60815817 100644
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -54,12 +54,17 @@ struct packetwise_redux_traits
 
 /* Value to be returned when size==0 , by default let's return 0 */
 template<typename PacketType, typename Func>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+PacketType packetwise_redux_empty_value(const Func& ) {
+  const typename unpacket_traits<PacketType>::type zero(0);
+  return pset1<PacketType>(zero);
+}
 
 /* For products the default is 1 */
 template<typename PacketType, typename Scalar>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) {
+  return pset1<PacketType>(Scalar(1));
+}
 
 /* Perform the actual reduction */
 template<typename Func, typename Evaluator, ...>
 
   enum {
     CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                   : TraversalSize==0 ? 1
-                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+                  : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
 
     _ArgFlags = evaluator<ArgType>::Flags,
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 202ed7100656bbdfd171be342461423f65b325b7..e2ddbd1d522a283a5992a504f023eebc48a05670 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -1019,7 +1019,7 @@ struct conservative_resize_like_impl
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(rows,cols);
+      Derived tmp(rows,cols);
      const Index common_rows = numext::mini(rows, _this.rows());
       const Index common_cols = numext::mini(cols, _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
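Editorial note: the `int(...)` casts in the cost enums above (and in the Redux/ProductEvaluators hunks below) all address the same issue — each unscoped enum has its own implementation-defined underlying type, so arithmetic mixing enumerators from different enums can overflow that type or draw enum-arithmetic warnings. Forcing every operand to `int` makes the computation happen in one known type. A compilable illustration of the pattern; the constant names are stand-ins, not Eigen's:

    #include <iostream>

    enum { Dynamic = -1, HugeCost = 10000 };
    enum { SizeAtCompileTime = 16 };  // stand-ins for the evaluator constants
    enum { CoeffReadCost = 2 };

    enum {
      // Every operand is cast to int, so no mixed-enum arithmetic occurs
      // and the multiply is evaluated in plain int.
      Cost = int(SizeAtCompileTime) == int(Dynamic)
                 ? int(HugeCost)
                 : int(SizeAtCompileTime) * int(CoeffReadCost)
    };

    int main() { std::cout << "Cost = " << Cost << "\n"; }  // Cost = 32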
@@ -1054,7 +1054,7 @@ struct conservative_resize_like_impl
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(other);
+      Derived tmp(other);
       const Index common_rows = numext::mini(tmp.rows(), _this.rows());
       const Index common_cols = numext::mini(tmp.cols(), _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index b766e1a1d643bfe5b4e7b2a574f839cf37ad6883..8cf294b287bc9a458cd9b5f0bb4c03e4a5b51126 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -831,7 +831,7 @@ struct diagonal_product_evaluator_base
   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
   enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) + int(evaluator<DiagonalType>::CoeffReadCost),
 
     MatrixFlags = evaluator<MatrixType>::Flags,
     DiagFlags = evaluator<DiagonalType>::Flags,
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 30598f4158decd1900e2c2f3667a6ff5396d4dd2..b6790d11050bb340c83acaa72197eba395c80222 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -58,7 +58,7 @@ public:
   public:
     enum {
       Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
-           : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+           : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
       UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
     };
 
@@ -331,7 +331,7 @@ struct redux_impl
   enum {
     PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     Size = Evaluator::SizeAtCompileTime,
-    VectorizedSize = (Size / PacketSize) * PacketSize
+    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
   };
 
   template
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 52de73b6fc371b8cbd45e13599d7c49b790903f7..882314cfe70089b12a588fec41318cc3a75fc733 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -250,7 +250,7 @@ class ReshapedImpl_dense
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
-      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+      return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
     }
 
   protected:
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index b7ed6f1cdcfbe5d0bc59dbbdf8270a03a20a9f83..8ce3b372a0ff792b98b8b1b00f1dd55742779bd0 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -66,7 +66,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     enum {
       Mode = internal::traits<SelfAdjointView>::Mode,
       Flags = internal::traits<SelfAdjointView>::Flags,
-      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
+      TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
     };
 
     typedef typename MatrixType::PlainObject PlainObject;
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 38794447564d717dec9eb31e3845fe44787f6dd0..dfbf99523a9c9cb5328a20e7d0f80574a4ad2cbf 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -168,7 +168,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(c
 {
   OtherDerived& other = _other.const_cast_derived();
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
 
   // If solving for a 0x0 matrix, nothing to do, simply return.
   if (derived().cols() == 0)
     return;
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 5014610420f3e8cea48f0b3fef0a345f1a920f8d..e38b3d5ad603389ae4036bc5de00b19aafff2d62 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -110,7 +110,7 @@ class SolverBase : public EigenBase<Derived>
     }
 
     /** \internal the return type of transpose() */
-    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<const Derived> ConstTransposeReturnType;
     /** \returns an expression of the transposed of the factored matrix.
       *
       * A typical usage is to solve for the transposed problem A^T x = b:
@@ -118,16 +118,16 @@ class SolverBase : public EigenBase<Derived>
       *
       * \sa adjoint(), solve()
       */
-    inline ConstTransposeReturnType transpose() const
+    inline const ConstTransposeReturnType transpose() const
     {
       return ConstTransposeReturnType(derived());
     }
 
     /** \internal the return type of adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
-                        ConstTransposeReturnType
-                     >::type AdjointReturnType;
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const ConstTransposeReturnType>,
+                        const ConstTransposeReturnType
+                     >::type AdjointReturnType;
     /** \returns an expression of the adjoint of the factored matrix
       *
      * A typical usage is to solve for the adjoint problem A' x = b:
@@ -137,7 +137,7 @@ class SolverBase : public EigenBase<Derived>
      *
      * \sa transpose(), solve()
      */
-    inline AdjointReturnType adjoint() const
+    inline const AdjointReturnType adjoint() const
    {
      return AdjointReturnType(derived().transpose());
    }
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 6494d51420a9b3eb195218434ace8b9f9c83d8eb..d164e5399697c00baa9d1b4f9156b2ba3d785a24 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -38,10 +38,14 @@ namespace Eigen {
   * \include Map_general_stride.cpp
   * Output: \verbinclude Map_general_stride.out
   *
-  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
   * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
   * not allowed).
   *
+  * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompileTime==1),
+  * the inner stride is the pointer increment between two consecutive elements,
+  * regardless of storage layout.
+ * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 2bc658f40b88bd588d39457dc9b8f19deb0c14cf..741504d9571eef3c06029ddc1417332014331db5 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -178,7 +178,7 @@ template class TransposeImpl * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Transpose +typename DenseBase::TransposeReturnType DenseBase::transpose() { return TransposeReturnType(derived()); @@ -191,7 +191,7 @@ DenseBase::transpose() * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename DenseBase::ConstTransposeReturnType +const typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 779152fa7344116d410017f81183894543864888..fdb8bc15a5b2c9ea0c133b26856a0cd091e3127a 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -53,7 +53,7 @@ template class TriangularBase : public EigenBase typedef Derived const& Nested; EIGEN_DEVICE_FUNC - inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); } + inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } @@ -819,7 +819,7 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con enum { unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost - && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT + && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT }; triangular_assignment_loop::run(kernel); @@ -853,7 +853,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -951,7 +951,7 @@ template EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); - internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); + internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); } namespace internal { diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 07a2e42433e2ad1d21ba8d94ecb8394809df65b0..00bcca87768eba2f48d31759a14a3c42beb3f6b1 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -124,7 +124,7 @@ void DenseBase::visit(Visitor& visitor) const enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits::Cost) <= EIGEN_UNROLLING_LIMIT }; return internal::visitor_impl::run(thisEval, visitor); } diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 506ca0be57157d639cc8287486aae97c88e2c89e..e9096c0a1ace3804067013864b170700433c1336 100644 --- 
a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -99,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) @@ -167,39 +169,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) @@ -350,39 +319,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index dd3f243d24ab3401b51aec8acca07de176c19478..24e01c46fa838a8ccc8e5dfbf4d94fbf03750395 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -285,11 +285,13 @@ template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { - return _mm256_sub_ps(_mm256_set1_ps(0.0),a); + const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); } template<> EIGEN_STRONG_INLINE Packet4d 
pnegate(const Packet4d& a) { - return _mm256_sub_pd(_mm256_set1_pd(0.0),a); + const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL)); + return _mm256_xor_pd(a, mask); } template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } @@ -628,11 +630,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { +#ifdef EIGEN_VECTORIZE_AVX512 + __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF); + EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from)); +#else Packet8i mask = _mm256_set1_epi8(static_cast(umask)); - const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe); mask = por(mask, bit_mask); mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); - EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +#if EIGEN_COMP_MSVC + // MSVC sometimes seems to use a bogus mask with maskstore. + const __m256i ifrom = _mm256_castps_si256(from); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast(to)); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast(to + 4)); +#else + EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from); +#endif +#endif } // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available @@ -1006,7 +1020,7 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #ifdef EIGEN_HAS_FP16_C - return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT); #else EIGEN_ALIGN32 float aux[8]; pstore(aux, a); @@ -1274,12 +1288,7 @@ EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { Packet8bf r; - // Flush input denormals value to zero with hardware capability. 
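Editorial note: the corrected `bit_mask` constants in the `pstoreu` hunk above matter because `_mm256_set1_epi8(umask)` replicates the 8-bit store mask into every byte of each 32-bit lane. OR-ing lane i with `~(1 << i)` replicated per byte yields all-ones exactly when bit i of the mask is set, which the subsequent compare-equal turns into a full lane mask. A scalar model of one lane; the function name is mine:

    #include <cassert>
    #include <cstdint>

    std::uint32_t lane_mask(std::uint8_t umask, int lane) {
      std::uint32_t broadcast = 0x01010101u * umask;           // _mm256_set1_epi8
      std::uint32_t byte = ~(1u << lane) & 0xffu;              // e.g. 0xfe for lane 0
      std::uint32_t bit_mask = 0x01010101u * byte;             // 0xfefefefe, 0xfdfdfdfd, ...
      // pcmp_eq against all-ones: true iff bit `lane` of umask was set.
      return (broadcast | bit_mask) == 0xffffffffu ? 0xffffffffu : 0u;
    }

    int main() {
      assert(lane_mask(0x05, 0) == 0xffffffffu);  // lane 0 selected
      assert(lane_mask(0x05, 1) == 0u);           // lane 1 not selected
      assert(lane_mask(0x05, 2) == 0xffffffffu);
    }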
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); - __m256 flush = _mm256_and_ps(a, a); - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - - __m256i input = _mm256_castps_si256(flush); + __m256i input = _mm256_castps_si256(a); #ifdef EIGEN_VECTORIZE_AVX2 // uint32_t lsb = (input >> 16); @@ -1293,7 +1302,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { // input = input >> 16; t = _mm256_srli_epi32(t, 16); // Check NaN before converting back to bf16 - __m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q); + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); __m256i nan = _mm256_set1_epi32(0x7fc0); t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask)); // output = numext::bit_cast(input); @@ -1316,7 +1325,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { lo = _mm_srli_epi32(lo, 16); hi = _mm_srli_epi32(hi, 16); // Check NaN before converting back to bf16 - __m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q); + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); __m128i nan = _mm_set1_epi32(0x7fc0); lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask))); hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1))); diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 45f22f436f9ebaa7699cda57acd90555443a6fe7..0167d050e95434003d12bd1e56807d9d4fc63f0f 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -37,7 +37,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { - return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) @@ -153,39 +155,6 @@ EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) return Packet4cf(res); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) @@ -225,7 +194,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -286,11 
+255,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { - #ifdef EIGEN_VECTORIZE_AVX512DQ - return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); - #else return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); - #endif } template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { @@ -441,6 +406,8 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] } +#if EIGEN_HAS_AVX512_MATH + template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { return psqrt_complex(a); } @@ -449,6 +416,8 @@ template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { return psqrt_complex(a); } +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 41929cb348b5d207dd9a80e47e2decac7f6a1596..017d6bff00ca99891d6ddb1f3c6a6b3e629e7265 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -14,8 +14,7 @@ namespace Eigen { namespace internal { -// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 +#if EIGEN_HAS_AVX512_MATH #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) @@ -119,74 +118,11 @@ pexp(const Packet16f& _x) { return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); } -/*template <> +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d pexp(const Packet8d& _x) { - Packet8d x = _x; - - _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); - - _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); - - // clamp x - x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); - - // Express exp(x) as exp(g + n*log(2)). - const Packet8d n = - _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1); - const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2); - x = psub(x, nC1); - x = psub(x, nC2); - - const Packet8d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. 
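Editorial note: the block being deleted here (already commented out) follows the classic Cephes recipe that `pexp_double` also builds on: write exp(x) = 2^n * exp(g) with n = round(x * log2(e)), and subtract n*ln(2) in two pieces C1 + C2 = ln(2) so the low-order bits of the reduced argument survive in double precision. A scalar sketch of just the range reduction, with `std::exp` standing in for the polynomial/rational step:

    #include <cmath>
    #include <cstdio>

    double exp_sketch(double x) {
      const double LOG2E = 1.4426950408889634073599;
      const double C1 = 0.693145751953125;          // high bits of ln(2)
      const double C2 = 1.42860682030941723212e-6;  // low bits of ln(2)
      double n = std::nearbyint(x * LOG2E);
      double g = (x - n * C1) - n * C2;             // |g| <= ln(2)/2, done in two steps
      // Any minimax approximation of exp on [-ln2/2, ln2/2] goes here.
      return std::ldexp(std::exp(g), static_cast<int>(n));  // scale by 2^n
    }

    int main() {
      std::printf("%.17g vs %.17g\n", exp_sketch(10.5), std::exp(10.5));
    }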
- Packet8d px = p8d_cephes_exp_p0; - px = pmadd(px, x2, p8d_cephes_exp_p1); - px = pmadd(px, x2, p8d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet8d qx = p8d_cephes_exp_q0; - qx = pmadd(qx, x2, p8d_cephes_exp_q1); - qx = pmadd(qx, x2, p8d_cephes_exp_q2); - qx = pmadd(qx, x2, p8d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm512_div_pd(px, psub(qx, px)); - x = pmadd(p8d_2, x, p8d_1); - - // Build e=2^n. - const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64( - _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52)); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, e), _x); - }*/ + return pexp_double(_x); +} F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp) @@ -389,7 +325,7 @@ Packet16f pexpm1(const Packet16f& _x) { F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) -#endif +#endif // EIGEN_HAS_AVX512_MATH template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index f8741372d9aa47a01e2320911b8ae7913194b797..4ab100cec1815cf29041cfe7f251cc83a2960ad3 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -28,6 +28,13 @@ namespace internal { #endif #endif +// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics. +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif + typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; @@ -72,12 +79,14 @@ struct packet_traits : default_packet_traits { HasMax = 1, HasConj = 1, HasSetLinear = 0, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasLog = EIGEN_HAS_AVX512_MATH, + HasLog1p = EIGEN_HAS_AVX512_MATH, + HasExp = EIGEN_HAS_AVX512_MATH, + HasExpm1 = EIGEN_HAS_AVX512_MATH, + HasSqrt = EIGEN_HAS_AVX512_MATH, + HasRsqrt = EIGEN_HAS_AVX512_MATH, + HasBessel = EIGEN_HAS_AVX512_MATH, + HasNdtri = EIGEN_HAS_AVX512_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, @@ -86,9 +95,7 @@ struct packet_traits : default_packet_traits { HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, - HasBessel = 1, - HasNdtri = 1 + HasRint = 1 }; }; @@ -109,7 +116,7 @@ template<> struct packet_traits : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -138,8 +145,9 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, + HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif @@ -288,11 +296,20 @@ EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - return _mm512_sub_ps(_mm512_set1_ps(0.0), a); + // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random 
results. + // The intel docs give it a relatively high latency as well, so we're probably + // better off with using _mm512_set_epi32 directly anyways. + const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000); + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask)); } template <> EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - return _mm512_sub_pd(_mm512_set1_pd(0.0), a); + const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, + 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL); + return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask)); } template <> @@ -486,7 +503,7 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ); return _mm512_castsi512_ps( _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } @@ -517,7 +534,7 @@ EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) { } template <> EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ); + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ); return _mm512_castsi512_pd( _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); } @@ -685,7 +702,7 @@ EIGEN_STRONG_INLINE Packet8d pload(const double* from) { template <> EIGEN_STRONG_INLINE Packet16i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( - reinterpret_cast(from)); + reinterpret_cast(from)); } template <> @@ -929,7 +946,8 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) // 2^b - Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); Packet8i lo = _mm256_slli_epi64(hi, 52); hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); @@ -937,7 +955,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons // 2^(e - 3b) b = psub(psub(psub(e, b), b), b); // e - 3b - hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); lo = _mm256_slli_epi64(hi, 52); hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); @@ -1424,60 +1442,11 @@ ploadquad(const Eigen::half* from) { } EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtph_ps(a); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float 
ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif } EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - return _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif } template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { @@ -1850,7 +1819,7 @@ struct packet_traits : default_packet_traits { HasInsert = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, // Currently fails test with bad accuracy. HasLog1p = 1, @@ -1943,23 +1912,15 @@ EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) { EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { Packet16bf r; - // Flush input denormals value to zero with hardware capability. - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -#if defined(EIGEN_VECTORIZE_AVX512DQ) - __m512 flush = _mm512_and_ps(a, a); -#else - __m512 flush = _mm512_max_ps(a, a); -#endif // EIGEN_VECTORIZE_AVX512DQ - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) // Since GCC 10.1 supports avx512bf16 and C style explicit cast // (C++ static_cast is not supported yet), do converion via intrinsic // and register path for performance. 
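Editorial note: apart from dropping the denormal-flushing, the `F32ToBf16` paths in these hunks keep the usual round-to-nearest-even trick: add `0x7fff` plus the low bit of the truncated result, then keep the high 16 bits, with NaNs canonicalized to the quiet payload `0x7fc0`. A scalar model of the conversion; the function name is mine:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    std::uint16_t f32_to_bf16(float f) {
      std::uint32_t input;
      std::memcpy(&input, &f, sizeof(input));  // bit_cast
      if (std::isnan(f)) return 0x7fc0;        // canonical quiet NaN
      std::uint32_t lsb = (input >> 16) & 1;   // makes ties round to even
      input += 0x7fff + lsb;                   // round to nearest
      return static_cast<std::uint16_t>(input >> 16);
    }

    int main() {
      assert(f32_to_bf16(1.0f) == 0x3f80);     // bfloat16(1.0)
      assert(f32_to_bf16(std::nanf("")) == 0x7fc0);
    }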
- r = (__m256i)(_mm512_cvtneps_pbh(flush)); + r = (__m256i)(_mm512_cvtneps_pbh(a)); + #else __m512i t; - __m512i input = _mm512_castps_si512(flush); + __m512i input = _mm512_castps_si512(a); __m512i nan = _mm512_set1_epi32(0x7fc0); // uint32_t lsb = (input >> 16) & 1; @@ -1972,9 +1933,9 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { t = _mm512_srli_epi32(t, 16); // Check NaN before converting back to bf16 - __mmask16 mask = _mm512_cmp_ps_mask(flush, flush, _CMP_ORD_Q); - t = _mm512_mask_blend_epi32(mask, nan, t); + __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + t = _mm512_mask_blend_epi32(mask, nan, t); // output.value = static_cast(input); r = _mm512_cvtepi32_epi16(t); #endif // EIGEN_VECTORIZE_AVX512BF16 diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index c6cb59e8f1c3ec4dcb37d0fd1cdb4c4428ab1c93..b3932998c1bc6b7383dc99ef75049409b6ee3c0d 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -74,7 +74,7 @@ struct Packet2cf return Packet2cf(*this) -= b; } EIGEN_STRONG_INLINE Packet2cf operator-(void) const { - return Packet2cf(vec_neg(v)); + return Packet2cf(-v); } Packet4f v; @@ -127,20 +127,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } -EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex& from0, const std::complex& from1) { Packet4f res0, res1; #ifdef __VSX__ - __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); - __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); #ifdef _BIG_ENDIAN __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #else __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #endif #else - *reinterpret_cast *>(&res0) = *from0; - *reinterpret_cast *>(&res1) = *from1; + *reinterpret_cast *>(&res0) = from0; + *reinterpret_cast *>(&res1) = from1; res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); #endif return Packet2cf(res0); @@ -206,45 +206,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE 
Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } @@ -327,7 +294,7 @@ struct Packet1cd return Packet1cd(*this) -= b; } EIGEN_STRONG_INLINE Packet1cd operator-(void) const { - return Packet1cd(vec_neg(v)); + return Packet1cd(-v); } Packet2d v; @@ -404,45 +371,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); } diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 3a7a329361eea0cd6ba25b81d663a8409ed70384..2b7c204e3fd37230c62b3dacf261e47313ff6d8d 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -40,24 +40,9 @@ Packet4f pcos(const Packet4f& _x) return pcos_float(_x); } -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt(const Packet4f& x) -{ - return vec_rsqrt(x); -} -#endif - #ifdef __VSX__ -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt(const Packet2d& x) -{ - return vec_rsqrt(x); -} -#endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { return vec_sqrt(x); @@ -69,12 +54,41 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f prsqrt(const Packet4f& x) +{ + return pset1(1.0f) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d prsqrt(const Packet2d& x) +{ + return pset1(1.0) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS 
Packet2d pexp(const Packet2d& _x) { return pexp_double(_x); } -#endif + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} + +#endif // __VSX__ // Hyperbolic Tangent function. template <> diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index e3ba06159ff2246f0d7ca3bdc328b828c0526f37..ea7749610d7ccfee948e9b111476c60db9b6a7cc 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -11,26 +11,41 @@ #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK +#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1 +#endif + #include "MatrixProductCommon.h" -// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX -#if EIGEN_COMP_LLVM -#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) -#ifdef __MMA__ -#define EIGEN_ALTIVEC_MMA_ONLY -#else -#define EIGEN_ALTIVEC_DISABLE_MMA +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#define EIGEN_ALTIVEC_DISABLE_MMA 0 #endif + +// Check for MMA builtin support. +#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin) +#if __has_builtin(__builtin_mma_assemble_acc) + #define EIGEN_ALTIVEC_MMA_SUPPORT #endif #endif -#ifdef __has_builtin -#if __has_builtin(__builtin_mma_assemble_acc) - #define ALTIVEC_MMA_SUPPORT +// Check if and how we should actually use MMA if supported. +#if defined(EIGEN_ALTIVEC_MMA_SUPPORT) + +#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) +#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0 #endif + +// Check if we want to enable dynamic dispatch. Not supported by LLVM. +#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM +#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1 +// Otherwise, use MMA by default if available. +#elif defined(__MMA__) +#define EIGEN_ALTIVEC_MMA_ONLY 1 #endif -#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#endif // EIGEN_ALTIVEC_MMA_SUPPORT + +#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) #include "MatrixProductMMA.h" #endif @@ -113,7 +128,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, * float32/64 and complex float32/64 version. 
**/ template -EIGEN_STRONG_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) +EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) { std::complex v; if(i < j) @@ -160,24 +175,23 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc rir += vectorDelta; } - if (j < cols) + + for(; j < cols; j++) { - rii = rir + ((cols - j) * rows); + rii = rir + rows; for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - std::complex v = getAdjointVal(i, k, rhs); + std::complex v = getAdjointVal(i, j, rhs); - blockBf[rir] = v.real(); - blockBf[rii] = v.imag(); + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); - rir += 1; - rii += 1; - } + rir += 1; + rii += 1; } + + rir += rows; } } @@ -256,19 +270,15 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs } } - if (j < cols) + for(; j < cols; j++) { for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - if(k <= i) - blockB[ri] = rhs(i, k); - else - blockB[ri] = rhs(k, i); - ri += 1; - } + if(j <= i) + blockB[ri] = rhs(i, j); + else + blockB[ri] = rhs(j, i); + ri += 1; } } } @@ -402,22 +412,18 @@ struct symm_pack_lhs * and offset and behaves accordingly. **/ -template -EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) -{ - const Index size = 16 / sizeof(Scalar); - pstore(to + (0 * size), block.packet[0]); - pstore(to + (1 * size), block.packet[1]); - pstore(to + (2 * size), block.packet[2]); - pstore(to + (3 * size), block.packet[3]); -} - -template -EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); pstore(to + (1 * size), block.packet[1]); + if (N > 2) { + pstore(to + (2 * size), block.packet[2]); + } + if (N > 3) { + pstore(to + (3 * size), block.packet[3]); + } } // General template for lhs & rhs complex packing. @@ -443,9 +449,9 @@ struct dhs_cpack { PacketBlock cblock; if (UseLhs) { - bload(cblock, lhs, j, i); + bload(cblock, lhs, j, i); } else { - bload(cblock, lhs, i, j); + bload(cblock, lhs, i, j); } blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); @@ -472,8 +478,8 @@ struct dhs_cpack { ptranspose(blocki); } - storeBlock(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 4*vectorSize; rii += 4*vectorSize; @@ -493,21 +499,12 @@ struct dhs_cpack { cblock.packet[1] = lhs.template loadPacket(i, j + 2); } } else { - const std::complex *lhs0, *lhs1; if (UseLhs) { - lhs0 = &lhs(j + 0, i); - lhs1 = &lhs(j + 1, i); - cblock.packet[0] = pload2(lhs0, lhs1); - lhs0 = &lhs(j + 2, i); - lhs1 = &lhs(j + 3, i); - cblock.packet[1] = pload2(lhs0, lhs1); + cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i)); + cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i)); } else { - lhs0 = &lhs(i, j + 0); - lhs1 = &lhs(i, j + 1); - cblock.packet[0] = pload2(lhs0, lhs1); - lhs0 = &lhs(i, j + 2); - lhs1 = &lhs(i, j + 3); - cblock.packet[1] = pload2(lhs0, lhs1); + cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1)); + cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3)); } } @@ -529,34 +526,50 @@ struct dhs_cpack { rir += ((PanelMode) ? 
(vectorSize*(2*stride - depth)) : vectorDelta); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) rir += (offset*(rows - j - vectorSize)); - rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + if(PanelMode) rir -= (offset*(vectorSize - 1)); - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + rii = rir + ((PanelMode) ? stride : depth); + + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockAt[rir] = lhs(i, j).real(); + + if(Conjugate) + blockAt[rii] = -lhs(i, j).imag(); + else + blockAt[rii] = lhs(i, j).imag(); + + rir += 1; + rii += 1; + } + + rir += ((PanelMode) ? (2*stride - depth) : depth); + } + } else { + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockAt[rir] = lhs(k, i).real(); if(Conjugate) blockAt[rii] = -lhs(k, i).imag(); else blockAt[rii] = lhs(k, i).imag(); - } else { - blockAt[rir] = lhs(i, k).real(); - if(Conjugate) - blockAt[rii] = -lhs(i, k).imag(); - else - blockAt[rii] = lhs(i, k).imag(); + rir += 1; + rii += 1; } - - rir += 1; - rii += 1; } } } @@ -582,16 +595,16 @@ struct dhs_pack{ PacketBlock block; if (UseLhs) { - bload(block, lhs, j, i); + bload(block, lhs, j, i); } else { - bload(block, lhs, i, j); + bload(block, lhs, i, j); } if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { ptranspose(block); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 4*vectorSize; } @@ -626,21 +639,33 @@ struct dhs_pack{ if(PanelMode) ri += vectorSize*(stride - offset - depth); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) ri += offset*(rows - j); + if(PanelMode) ri += offset; - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockA[ri] = lhs(i, j); + ri += 1; + } + + if(PanelMode) ri += stride - depth; + } + } else { + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockA[ri] = lhs(k, i); - } else { - blockA[ri] = lhs(i, k); + ri += 1; } - ri += 1; } } } @@ -676,7 +701,7 @@ struct dhs_pack(j, i + 1); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 2*vectorSize; } @@ -753,7 +778,7 @@ struct dhs_pack(i + 1, j + 0); //[b1 b2] block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] - storeBlock(blockB + ri, block); + storeBlock(blockB + ri, block); } ri += 4*vectorSize; @@ -784,19 +809,17 @@ struct dhs_pack(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -937,7 +960,7 @@ struct dhs_cpack cblock; PacketBlock blockr, blocki; - bload(cblock, rhs, i, j); + bload(cblock, rhs, i, j); blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); @@ -951,8 +974,8 @@ struct dhs_cpack(blockBt + rir, blockr); - storeBlock(blockBt + rii, blocki); + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -961,27 +984,26 @@ struct dhs_cpack -EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) -{ - if(NegativeAccumulate) - { - 
acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
- acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
- acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
- acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
- } else {
- acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
- acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
- acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
- acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
- }
-}
-
-template
-EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV)
+template
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV)
 {
   if(NegativeAccumulate)
   {
     acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+    if (N > 1) {
+      acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
+    }
+    if (N > 2) {
+      acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
+    }
+    if (N > 3) {
+      acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
+    }
   } else {
     acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+    if (N > 1) {
+      acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
+    }
+    if (N > 2) {
+      acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
+    }
+    if (N > 3) {
+      acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
+    }
   }
 }

 template
-EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV)
 {
   Packet lhsV = pload(lhs);

-  pger_common(acc, lhsV, rhsV);
+  pger_common(acc, lhsV, rhsV);
 }

-template
-EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
+template
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV)
 {
 #ifdef _ARCH_PWR9
   lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
@@ -1040,37 +1063,37 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In
 #endif
 }

-template
-EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+template
+EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV)
 {
   Packet lhsV;
-  loadPacketRemaining(lhs, lhsV, remaining_rows);
+  loadPacketRemaining(lhs, lhsV);

-  pger_common(acc, lhsV, rhsV);
+  pger_common(acc, lhsV, rhsV);
 }

// 512-bit rank-1 update of complex acc. It takes decoupled accumulators as entries. It also takes care of mixed types real * complex and complex * real.
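For reference, the "decoupled accumulator" update that the pgerc/pgerc_common family implements is just the complex product split into its real and imaginary parts, with conjugation reduced to sign flips. A minimal scalar sketch (plain doubles instead of VSX packets; rank1_step is an illustrative name, not Eigen's):

```cpp
#include <cassert>
#include <complex>

// One rank-1 update step c += a * b, with the real and imaginary parts kept
// in separate ("decoupled") accumulators, as the packed kernels do.
// Conjugating either operand only flips a sign, which is why the kernel can
// express all four conjugation variants with the same madd/nmsub primitives.
template <bool ConjLhs, bool ConjRhs>
void rank1_step(double lr, double li, double rr, double ri,
                double& accRe, double& accIm) {
  if (ConjLhs) li = -li;
  if (ConjRhs) ri = -ri;
  accRe += lr * rr - li * ri;  // one vec_madd, then one vec_nmsub
  accIm += lr * ri + li * rr;  // two vec_madd
}

int main() {
  double accRe = 0.0, accIm = 0.0;
  rank1_step<false, true>(1.5, -2.0, 0.5, 3.0, accRe, accIm);
  // Check against std::complex performing the same conjugated product.
  std::complex<double> c = std::complex<double>(1.5, -2.0) *
                           std::conj(std::complex<double>(0.5, 3.0));
  assert(accRe == c.real() && accIm == c.imag());
}
```

The LhsIsReal/RhsIsReal specializations in the diff simply drop the terms whose imaginary input is identically zero, rather than computing them and discarding the result.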
template -EIGEN_STRONG_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) { - pger_common(accReal, lhsV, rhsV); + pger_common(accReal, lhsV, rhsV); if(LhsIsReal) { - pger_common(accImag, lhsV, rhsVi); + pger_common(accImag, lhsV, rhsVi); EIGEN_UNUSED_VARIABLE(lhsVi); } else { if (!RhsIsReal) { - pger_common(accReal, lhsVi, rhsVi); - pger_common(accImag, lhsV, rhsVi); + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); } else { EIGEN_UNUSED_VARIABLE(rhsVi); } - pger_common(accImag, lhsVi, rhsV); + pger_common(accImag, lhsVi, rhsV); } } template -EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV = ploadLhs(lhs_ptr); Packet lhsVi; @@ -1080,8 +1103,8 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } -template -EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); @@ -1097,148 +1120,158 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar #endif } -template -EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV, lhsVi; - loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); + loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi); pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } template -EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs) +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) { - return *reinterpret_cast(const_cast(lhs)); + return ploadu(lhs); } // Zero the accumulator on PacketBlock. -template -EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) -{ - acc.packet[0] = pset1((Scalar)0); - acc.packet[1] = pset1((Scalar)0); - acc.packet[2] = pset1((Scalar)0); - acc.packet[3] = pset1((Scalar)0); -} - -template -EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); + if (N > 1) { + acc.packet[1] = pset1((Scalar)0); + } + if (N > 2) { + acc.packet[2] = pset1((Scalar)0); + } + if (N > 3) { + acc.packet[3] = pset1((Scalar)0); + } } // Scale the PacketBlock vectors by alpha. 
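The overload pairs removed here and just below (fixed 4-packet and 1-packet versions of bsetzero, bscale, bscalec_common, and band) all collapse into one template over N whose tail statements are guarded by `if (N > k)`; since N is a compile-time constant, the guards fold away at instantiation. A standalone sketch of the pattern, using C++17 `if constexpr` for the self-contained version (the kernel itself uses a plain `if` on N, which optimizing compilers fold the same way; Block and bscale_sketch are illustrative names):

```cpp
#include <cassert>

// Stand-in for Eigen's PacketBlock: N "packets", here reduced to scalars.
template <typename Packet, int N>
struct Block { Packet packet[N]; };

// acc = pAlpha * accZ + acc, unrolled at compile time for N = 1..4,
// mirroring the N-templated bscale in the diff.
template <typename Packet, int N>
inline void bscale_sketch(Block<Packet, N>& acc, const Block<Packet, N>& accZ,
                          const Packet& pAlpha) {
  acc.packet[0] += pAlpha * accZ.packet[0];
  if constexpr (N > 1) acc.packet[1] += pAlpha * accZ.packet[1];
  if constexpr (N > 2) acc.packet[2] += pAlpha * accZ.packet[2];
  if constexpr (N > 3) acc.packet[3] += pAlpha * accZ.packet[3];
}

int main() {
  Block<float, 2> acc{{1.f, 1.f}}, accZ{{2.f, 4.f}};
  bscale_sketch(acc, accZ, 0.5f);        // only the first two lanes exist
  assert(acc.packet[0] == 2.f && acc.packet[1] == 3.f);
}
```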
-template
-EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
-{
- acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
- acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
- acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
- acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
-}
-
-template
-EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
+template
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
+  if (N > 1) {
+    acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
+  }
+  if (N > 2) {
+    acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
+  }
+  if (N > 3) {
+    acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
+  }
 }

-template
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
-{
- acc.packet[0] = pmul(accZ.packet[0], pAlpha);
- acc.packet[1] = pmul(accZ.packet[1], pAlpha);
- acc.packet[2] = pmul(accZ.packet[2], pAlpha);
- acc.packet[3] = pmul(accZ.packet[3], pAlpha);
-}
-
-template
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
+template
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmul(accZ.packet[0], pAlpha);
+  if (N > 1) {
+    acc.packet[1] = pmul(accZ.packet[1], pAlpha);
+  }
+  if (N > 2) {
+    acc.packet[2] = pmul(accZ.packet[2], pAlpha);
+  }
+  if (N > 3) {
+    acc.packet[3] = pmul(accZ.packet[3], pAlpha);
+  }
 }

 // Complex version of PacketBlock scaling.
 template
-EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag)
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag)
 {
-  bscalec_common(cReal, aReal, bReal);
+  bscalec_common(cReal, aReal, bReal);

-  bscalec_common(cImag, aImag, bReal);
+  bscalec_common(cImag, aImag, bReal);

-  pger_common(&cReal, bImag, aImag.packet);
+  pger_common(&cReal, bImag, aImag.packet);

-  pger_common(&cImag, bImag, aReal.packet);
+  pger_common(&cImag, bImag, aReal.packet);
 }

-template
-EIGEN_STRONG_INLINE void band(PacketBlock& acc, const Packet& pMask)
+template
+EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask)
 {
   acc.packet[0] = pand(acc.packet[0], pMask);
-  acc.packet[1] = pand(acc.packet[1], pMask);
-  acc.packet[2] = pand(acc.packet[2], pMask);
-  acc.packet[3] = pand(acc.packet[3], pMask);
+  if (N > 1) {
+    acc.packet[1] = pand(acc.packet[1], pMask);
+  }
+  if (N > 2) {
+    acc.packet[2] = pand(acc.packet[2], pMask);
+  }
+  if (N > 3) {
+    acc.packet[3] = pand(acc.packet[3], pMask);
+  }
 }

-template
-EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask)
+template
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask)
 {
-  band(aReal, pMask);
-  band(aImag, pMask);
+  band(aReal, pMask);
+  band(aImag, pMask);

-  bscalec(aReal, aImag, bReal, bImag, cReal, cImag);
+  bscalec(aReal, aImag, bReal, bImag, cReal, cImag);
 }

// Load a PacketBlock, the N parameters make tuning gemm easier so we can
add more accumulators as needed. -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - } -} - -// An overload of bload when you have a PacketBLock with 8 vectors. -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); - acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); - acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); - acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); + acc.packet[0] = res.template loadPacket(row + 0, col); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row + 1, col); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row + 2, col); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row + 3, col); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + 0, col + accCols); + if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + 1, col + accCols); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + 2, col + accCols); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + 3, col + accCols); + } + } } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); - acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); - acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); - acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); + acc.packet[0] = res.template loadPacket(row, col + 0); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row, col + 1); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row, col + 2); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row, col + 3); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + accCols, col + 0); + if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + accCols, col + 1); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + accCols, col + 2); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + accCols, col + 3); + } + } } } -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const 
DataMapper& res, Index row, Index col) -{ - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); -} - const static Packet4i mask41 = { -1, 0, 0, 0 }; const static Packet4i mask42 = { -1, -1, 0, 0 }; const static Packet4i mask43 = { -1, -1, -1, 0 }; @@ -1246,7 +1279,7 @@ const static Packet4i mask43 = { -1, -1, -1, 0 }; const static Packet2l mask21 = { -1, 0 }; template -EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(float(0.0)); // Not used @@ -1260,7 +1293,7 @@ EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) } template<> -EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) +EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(double(0.0)); // Not used @@ -1269,22 +1302,44 @@ EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) } } -template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) { - band(accZ, pMask); + band(accZ, pMask); - bscale(acc, accZ, pAlpha); + bscale(acc, accZ, pAlpha); } -template -EIGEN_STRONG_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +template EIGEN_ALWAYS_INLINE void +pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - pbroadcast4(a, a0, a1, a2, a3); + pbroadcast4(a, a0, a1, a2, a3); } template<> -EIGEN_STRONG_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { a1 = pload(a); a3 = pload(a + 2); @@ -1294,89 +1349,96 @@ EIGEN_STRONG_INLINE void pbroadcast4_old(const double* a, Packet2d& a0 a3 = vec_splat(a3, 1); } -// PEEL loop factor. 
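The mask helpers above exist so the row tail can keep using full-width packet arithmetic: bmask materializes an all-ones/all-zeros lane mask from remaining_rows, and band clears the dead lanes before the alpha scale so a whole packet can still be stored. A scalar sketch of the same idea (four float lanes, as in mask41/mask42/mask43; names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Build a 4-lane mask: lanes [0, remaining_rows) all-ones, the rest zero,
// like the mask4x constants selected by bmask in the diff.
inline void bmask_sketch(uint32_t mask[4], int remaining_rows) {
  for (int i = 0; i < 4; ++i)
    mask[i] = (i < remaining_rows) ? 0xFFFFFFFFu : 0u;
}

// band: clear the lanes that fall outside the live remainder
// (a single vec_and with the mask in the packet code).
inline void band_sketch(float acc[4], const uint32_t mask[4]) {
  for (int i = 0; i < 4; ++i)
    if (!mask[i]) acc[i] = 0.0f;
}

int main() {
  uint32_t mask[4];
  bmask_sketch(mask, 3);                    // 3 live rows in the tail
  float acc[4] = {1.f, 2.f, 3.f, 4.f};
  band_sketch(acc, mask);
  assert(acc[2] == 3.f && acc[3] == 0.f);   // lane 3 masked off
}
```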
-#define PEEL 7 - -template -EIGEN_STRONG_INLINE void MICRO_EXTRA_COL( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows, - Index remaining_cols) +template EIGEN_ALWAYS_INLINE void +pbroadcastN(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } } -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha) +template<> EIGEN_ALWAYS_INLINE void +pbroadcastN(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero; + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} - bsetzero(accZero); +// PEEL loop factor. +#define PEEL 7 +#define PEEL_ROW 7 - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); +#define MICRO_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_ZERO_PEEL(peel) \ + if ((PEEL_ROW > peel) && (peel != 0)) { \ + bsetzero(accZero##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##peel); \ } - for(; k < depth; k++) - { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + +#define MICRO_ZERO_PEEL_ROW \ + MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL); + +#define MICRO_WORK_PEEL(peel) \ + if (PEEL_ROW > peel) { \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + pger(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } - accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); - for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col) += accZero.packet[0][i]; +#define MICRO_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \ + MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \ + lhs_ptr += (remaining_rows * PEEL_ROW); \ + rhs_ptr += (accRows * PEEL_ROW); + +#define MICRO_ADD_PEEL(peel, sum) \ + if (PEEL_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accZero##sum.packet[i] += accZero##peel.packet[i]; \ + } \ } -} -template -EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW( +#define MICRO_ADD_PEEL_ROW \ + MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \ + MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0) + 
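The MICRO_ZERO_PEEL / MICRO_WORK_PEEL / MICRO_ADD_PEEL trio gives each peeled depth step its own accumulator, so the fused multiply-adds within a PEEL_ROW batch stay independent instead of serializing on a single register; MICRO_ADD_PEEL_ROW then folds the partials pairwise (4 into 0, 5 into 1, 6 into 2, 7 into 3, then 2 into 0, 3 into 1, then 1 into 0). A scalar sketch of that reduction order, assuming all eight accumulators are live:

```cpp
#include <cassert>

// Fold 8 per-peel partial sums the way MICRO_ADD_PEEL_ROW does: pairwise,
// so the additions themselves remain independent rather than forming a chain.
double add_peel_sketch(double acc[8]) {
  acc[0] += acc[4]; acc[1] += acc[5]; acc[2] += acc[6]; acc[3] += acc[7];
  acc[0] += acc[2]; acc[1] += acc[3];
  acc[0] += acc[1];
  return acc[0];
}

int main() {
  double acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(add_peel_sketch(acc) == 36.0);  // same total as a serial sum
}
```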
+template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows) + PacketBlock &accZero) { Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } -template -EIGEN_STRONG_INLINE void gemm_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1387,105 +1449,125 @@ EIGEN_STRONG_INLINE void gemm_extra_row( Index col, Index rows, Index cols, - Index remaining_rows, const Packet& pAlpha, const Packet& pMask) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero, acc; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc; - bsetzero(accZero); + bsetzero(accZero0); - Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); - } + if (remaining_depth >= PEEL_ROW) { + MICRO_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + MICRO_WORK_PEEL_ROW + } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth); + MICRO_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero0); } if ((remaining_depth == depth) && (rows >= accCols)) { - for(Index j = 0; j < 4; j++) { - acc.packet[j] = res.template loadPacket(row, col + j); - } - bscale(acc, accZero, pAlpha, pMask); - res.template storePacketBlock(row, col, acc); + bload(acc, res, row, 0); + bscale(acc, accZero0, pAlpha, pMask); + res.template storePacketBlock(row, 0, acc); } else { for(; k < depth; k++) { Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero0, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } - for(Index j = 0; j < 4; j++) { - accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); - } - for(Index j = 0; j < 4; j++) { + for(Index j = 0; j < accRows; j++) { + accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]); for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col + j) += accZero.packet[j][i]; + res(row + i, j) += accZero0.packet[j][i]; } } } } -#define MICRO_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) - -#define MICRO_UNROLL_WORK(func, func2, peel) \ - MICRO_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ - func(4,peel) func(5,peel) func(6,peel) func(7,peel) - -#define MICRO_LOAD_ONE(iter) \ - if (unroll_factor > iter) { \ - lhsV##iter = ploadLhs(lhs_ptr##iter); \ - lhs_ptr##iter += accCols; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsV##iter); \ - } - -#define MICRO_WORK_ONE(iter, peel) \ - if (unroll_factor > iter) { \ - pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ +template 
+EIGEN_ALWAYS_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + switch(remaining_rows) { + case 1: + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; } +} -#define MICRO_TYPE_PEEL4(func, func2, peel) \ - if (PEEL > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ - MICRO_UNROLL_WORK(func, func2, peel) \ +#define MICRO_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_UNROLL_WORK(func, func2, peel) \ + MICRO_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ + func(4,peel) func(5,peel) func(6,peel) func(7,peel) + +#define MICRO_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += accCols; \ } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ } -#define MICRO_TYPE_PEEL1(func, func2, peel) \ +#define MICRO_WORK_ONE(iter, peel) \ + if (unroll_factor > iter) { \ + pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + } + +#define MICRO_TYPE_PEEL4(func, func2, peel) \ if (PEEL > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ MICRO_UNROLL_WORK(func, func2, peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \ func(func1,func2,0); func(func1,func2,1); \ func(func1,func2,2); func(func1,func2,3); \ func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,6); func(func1,func2,7); #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M]; \ @@ -1499,17 +1581,9 @@ EIGEN_STRONG_INLINE void gemm_extra_row( MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ rhs_ptr += accRows; -#define MICRO_ONE_PEEL1 \ - MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += (remaining_cols * PEEL); - -#define MICRO_ONE1 \ - MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += remaining_cols; - #define MICRO_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accZero##iter); \ + bsetzero(accZero##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ } @@ -1518,7 +1592,7 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = 
lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -1534,25 +1608,13 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ - acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ - acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ + bload(acc, res, row + iter*accCols, 0); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, 0, acc); \ } #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) -#define MICRO_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ - } - -#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const DataMapper& res, @@ -1560,16 +1622,13 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { -asm("#gemm begin"); const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; MICRO_SRC_PTR MICRO_DST_PTR @@ -1588,104 +1647,102 @@ asm("#gemm begin"); MICRO_STORE row += unroll_factor*accCols; -asm("#gemm end"); } -template -EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index& row, + Index strideB, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlpha) + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, *lhs_ptr7; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; - - MICRO_SRC_PTR - MICRO_DST_PTR + const DataMapper res3 = res.getSubMapper(0, col); - Index k = 0; - for(; k + PEEL <= depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - MICRO_PREFETCH - MICRO_ONE_PEEL1 - } - for(; k < depth; k++) - { - MICRO_ONE1 - } - MICRO_COL_STORE + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - row += unroll_factor*accCols; -} - -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - 
Index strideA, - Index offsetA, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha) -{ #define MAX_UNROLL 6 while(row + MAX_UNROLL*accCols <= rows) { - gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); } switch( (rows-row)/accCols ) { #if MAX_UNROLL > 7 case 7: - gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 6 case 6: - gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 5 - case 5: - gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 5: + gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 4 - case 4: - gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 4: + gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 3 - case 3: - gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 3: + gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 2 - case 2: - gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 2: + gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 1 - case 1: - gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 1: + gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; + default: + break; } #undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } +} + +template +EIGEN_STRONG_INLINE void gemm_extra_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + for (; col < cols; col++) { + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, 
rows, cols, remaining_rows, pAlpha, pMask); + } } /**************** @@ -1695,7 +1752,6 @@ template(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_UNROLL > 7 - case 7: - gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 6 - case 6: - gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 5 - case 5: - gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 4 - case 4: - gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 3 - case 3: - gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 2 - case 2: - gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 1 - case 1: - gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } - - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } - } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -1787,117 +1774,66 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const // PEEL_COMPLEX loop factor. 
#define PEEL_COMPLEX 3 +#define PEEL_COMPLEX_ROW 3 -template -EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL( - const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, - const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows, - Index remaining_cols) -{ - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); -} - -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag) -{ - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; - if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - - bsetzero(accReal); - bsetzero(accImag); +#define MICRO_COMPLEX_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); +#define MICRO_COMPLEX_ZERO_PEEL(peel) \ + if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \ + bsetzero(accReal##peel); \ + bsetzero(accImag##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##peel); \ + EIGEN_UNUSED_VARIABLE(accImag##peel); \ } - for(; k < depth; k++) - { - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; +#define MICRO_COMPLEX_ZERO_PEEL_ROW \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL); + +#define MICRO_COMPLEX_WORK_PEEL(peel) \ + if (PEEL_COMPLEX_ROW > peel) { \ + pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag + 
(accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + pgerc(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); +#define MICRO_COMPLEX_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \ + Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \ + lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \ + if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \ + rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) - { - res(row + 0, col + 0) += pfirst(acc0.packet[0]); - } else { - acc0.packet[0] += res.template loadPacket(row + 0, col + 0); - res.template storePacketBlock(row + 0, col + 0, acc0); - if(remaining_rows > accColsC) { - res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); - } +#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \ + if (PEEL_COMPLEX_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accReal##sum.packet[i] += accReal##peel.packet[i]; \ + accImag##sum.packet[i] += accImag##peel.packet[i]; \ + } \ } -} -template -EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW( +#define MICRO_COMPLEX_ADD_PEEL_ROW \ + MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ + MICRO_COMPLEX_ADD_PEEL(1, 0) + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows) + PacketBlock &accReal, PacketBlock &accImag) { Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); @@ -1906,8 +1842,8 @@ EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW( else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1919,108 +1855,141 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( Index col, Index rows, Index cols, - Index remaining_rows, const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask) { -asm("#gemm_complex begin"); const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); const Scalar* lhs_ptr_real = lhs_base + 
advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; + const Scalar* lhs_ptr_imag = NULL; if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; - bsetzero(accReal); - bsetzero(accImag); + bsetzero(accReal0); + bsetzero(accImag0); - Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } + if (remaining_depth >= PEEL_COMPLEX_ROW) { + MICRO_COMPLEX_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + MICRO_COMPLEX_WORK_PEEL_ROW + } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth); + MICRO_COMPLEX_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0); } if ((remaining_depth == depth) && (rows >= accCols)) { - bload(tRes, res, row, col); - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); - bcouple(taccReal, taccImag, tRes, acc0, acc1); - res.template storePacketBlock(row + 0, col, acc0); - res.template storePacketBlock(row + accColsC, col, acc1); + bload(tRes, res, row, 0); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + res.template storePacketBlock(row + 0, 0, acc0); + res.template storePacketBlock(row + accColsC, 0, acc1); } else { for(; k < depth; k++) { Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; rhs_ptr_real += accRows; if(!RhsIsReal) rhs_ptr_imag += accRows; } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) 
{ - for(Index j = 0; j < 4; j++) { - res(row + 0, col + j) += pfirst(acc0.packet[j]); + for(Index j = 0; j < accRows; j++) { + res(row + 0, j) += pfirst(acc0.packet[j]); } } else { - for(Index j = 0; j < 4; j++) { + for(Index j = 0; j < accRows; j++) { PacketBlock acc2; - acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; - res.template storePacketBlock(row + 0, col + j, acc2); + acc2.packet[0] = res.template loadPacket(row + 0, j) + acc0.packet[j]; + res.template storePacketBlock(row + 0, j, acc2); if(remaining_rows > accColsC) { - res(row + accColsC, col + j) += pfirst(acc1.packet[j]); + res(row + accColsC, j) += pfirst(acc1.packet[j]); } } } } -asm("#gemm_complex end"); +} + +template +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + switch(remaining_rows) { + case 1: + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + } + break; + } } #define MICRO_COMPLEX_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ MICRO_COMPLEX_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) + func(0,peel) func(1,peel) func(2,peel) func(3,peel) #define MICRO_COMPLEX_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -2028,37 +1997,16 @@ asm("#gemm_complex end"); #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ if (unroll_factor > iter) { \ - pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ - } - -#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ - if (unroll_factor > iter) { \ - pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgerc_common(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ } #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + 
pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ if(!RhsIsReal) { \ - pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } \ - MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } - -#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ - if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ - if(!RhsIsReal) { \ - rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ + pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ } else { \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ @@ -2069,13 +2017,10 @@ asm("#gemm_complex end"); } #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ - Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \ func(func1,func2,0); func(func1,func2,1); \ - func(func1,func2,2); func(func1,func2,3); \ - func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,2); func(func1,func2,3); #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M], rhsVi0[M];\ @@ -2091,20 +2036,10 @@ asm("#gemm_complex end"); rhs_ptr_real += accRows; \ if(!RhsIsReal) rhs_ptr_imag += accRows; -#define MICRO_COMPLEX_ONE_PEEL1 \ - MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ - if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); - -#define MICRO_COMPLEX_ONE1 \ - MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += remaining_cols; \ - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accReal##iter); \ - bsetzero(accImag##iter); \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accReal##iter); \ EIGEN_UNUSED_VARIABLE(accImag##iter); \ @@ -2114,15 +2049,9 @@ asm("#gemm_complex end"); #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) @@ -2130,35 +2059,21 @@ asm("#gemm_complex end"); #define MICRO_COMPLEX_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - 
EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) #define MICRO_COMPLEX_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + bload(tRes, res, row + iter*accCols, 0); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + iter*accCols + 0, 0, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, 0, acc1); \ } #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) -#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ - } - -#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const DataMapper& res, @@ -2166,30 +2081,26 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { -asm("#gemm_complex_unrolled begin"); const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_DST_PTR @@ -2211,115 +2122,95 @@ asm("#gemm_complex_unrolled begin"); MICRO_COMPLEX_STORE row += unroll_factor*accCols; -asm("#gemm_complex_unrolled end"); } -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_complex_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { - const 
Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) { - rhs_ptr_imag = rhs_base + remaining_cols*strideB; - } else { - EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const DataMapper res3 = res.getSubMapper(0, col); - MICRO_COMPLEX_SRC_PTR - MICRO_COMPLEX_DST_PTR + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - Index k = 0; - for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - MICRO_COMPLEX_PREFETCH - MICRO_COMPLEX_ONE_PEEL1 +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); } - for(; k < depth; k++) - { - MICRO_COMPLEX_ONE1 + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; } - MICRO_COMPLEX_COL_STORE +#undef MAX_COMPLEX_UNROLL - row += unroll_factor*accCols; + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, 
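// ---------------------------------------------------------------------------
// The while-then-switch shape used by gemm_complex_cols above, reduced to a
// standalone sketch (assumed names; the panel width is fixed to 1 here for
// clarity, where Eigen uses accCols-wide row panels). The hot loop always
// runs the kernel with the maximum unroll factor; the switch then dispatches
// the last 1..MAX_UNROLL-1 panels to a kernel instantiated with that exact
// count, so unroll_factor is a compile-time constant in every instantiation.
template <int unroll_factor>
void row_panel_kernel_sketch(long& row) { row += unroll_factor; }

inline void dispatch_row_panels_sketch(long rows)
{
  const int MAX_UNROLL = 3;
  long row = 0;
  while (row + MAX_UNROLL <= rows) {
    row_panel_kernel_sketch<MAX_UNROLL>(row);
  }
  switch (rows - row) {
    case 2: row_panel_kernel_sketch<2>(row); break;
    case 1: row_panel_kernel_sketch<1>(row); break;
    default: break;  // 0 panels left
  }
}
// ---------------------------------------------------------------------------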
pAlphaReal, pAlphaImag); + for (; col < cols; col++) { + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL } template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -2334,64 +2225,10 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, 
RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC @@ -2429,6 +2266,7 @@ void gemm_pack_lhs struct gemm_pack_rhs { @@ -2456,6 +2294,7 @@ void gemm_pack_rhs pack; pack(blockB, rhs, depth, cols, stride, offset); } +#endif template struct gemm_pack_lhs @@ -2484,6 +2323,7 @@ void gemm_pack_lhs pack; pack(blockA, lhs, depth, rows, stride, offset); } + template struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> { @@ -2512,6 +2352,7 @@ void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, pack(blockA, lhs, depth, rows, stride, offset); } +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK template struct gemm_pack_rhs { @@ -2539,6 +2380,7 @@ void gemm_pack_rhs dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } +#endif template struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> @@ -2646,10 +2488,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2659,7 +2501,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2685,20 +2527,20 @@ void gebp_kernel, std::complex, Index, DataMapper, mr void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, 
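// ---------------------------------------------------------------------------
// The simplified driver structure after this refactor: the old remaining_cols
// kernels are gone, and leftover columns reuse the same per-column-block
// helper. A sketch of the tiling arithmetic only (illustrative names):
inline long column_tiling_sketch(long cols, long accRows)
{
  long col = 0;
  for (; col + accRows <= cols; col += accRows) {
    // full accRows-wide block: handled by gemm_complex_cols(..., col, ...)
  }
  return cols - col;  // == cols % accRows; gemm_complex_extra_cols walks these
}
// ---------------------------------------------------------------------------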
std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2723,20 +2565,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const float*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, 
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2761,20 +2603,20 @@ void gebp_kernel, float, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const float*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2798,10 +2640,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && 
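// ---------------------------------------------------------------------------
// Every gebp_kernel specialization in this hunk repeats the same selection
// idiom; it is condensed here with placeholder kernels (the real code points
// at gemm/gemmMMA instantiations). Under EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH
// both kernels are compiled into the binary, and GCC's
// __builtin_cpu_supports picks the MMA path on Power10 at run time.
inline void generic_kernel_sketch() {}  // stand-in for a gemm<...> instance
inline void mma_kernel_sketch() {}      // stand-in for a gemmMMA<...> instance

typedef void (*kernel_ptr_sketch)();
inline kernel_ptr_sketch select_kernel_sketch()
{
#if defined(EIGEN_ALTIVEC_MMA_ONLY)
  return &mma_kernel_sketch;              // MMA-only build
#elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
  if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
    return &mma_kernel_sketch;            // ISA 3.1 + MMA detected at run time
  return &generic_kernel_sketch;
#else
  return &generic_kernel_sketch;          // baseline VSX build
#endif
}
// ---------------------------------------------------------------------------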
__builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2811,7 +2653,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2836,20 +2678,20 @@ void gebp_kernel, std::complex, Index, DataMapper, const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2874,20 +2716,20 @@ void gebp_kernel, double, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const double*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = 
&Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2912,20 +2754,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const double*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, 
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 6e74116b9e97e1fab00bc261dd676813f50f4dcc..bf01dba1cadd1d6be00108634fd5478c15eecd6b 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -9,22 +9,8 @@ namespace Eigen { namespace internal { -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha); - template -EIGEN_STRONG_INLINE void gemm_extra_row( +EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -39,41 +25,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha); - -template -EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows); - -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( +template +EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index row, + Index offsetB, Index col, + Index rows, + Index cols, Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlpha, + const Packet& pMask); + +template +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows); template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -91,130 +64,95 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pMask); template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlphaImag, + const Packet& pMask); template -EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs); +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_STRONG_INLINE 
void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); template -EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); - -const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, - 16, 17, 18, 19, - 4, 5, 6, 7, - 20, 21, 22, 23}; - -const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, - 24, 25, 26, 27, - 12, 13, 14, 15, - 28, 29, 30, 31}; -//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 -const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, - 16, 17, 18, 19, 20, 21, 22, 23}; - -//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 -const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31}; +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. -template -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); -} - -template -EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) -{ - bcouple_common(taccReal, taccImag, acc1, acc2); - - acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); - acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); - acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); - - acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); - acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); - acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); - acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); -} - -template -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); + acc1.packet[0].v = vec_mergeh(taccReal.packet[0], 
taccImag.packet[0]); + if (N > 1) { + acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]); + } + + acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]); + if (N > 1) { + acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]); + } } -template -EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { - bcouple_common(taccReal, taccImag, acc1, acc2); + bcouple_common(taccReal, taccImag, acc1, acc2); acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - - acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); -} - -template<> -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); -} - -template<> -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); + if (N > 1) { + acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); + } + if (N > 2) { + acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); + } + if (N > 3) { + acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + } + + acc2.packet[0] = padd(tRes.packet[0+N], acc2.packet[0]); + if (N > 1) { + acc2.packet[1] = padd(tRes.packet[1+N], acc2.packet[1]); + } + if (N > 2) { + acc2.packet[2] = padd(tRes.packet[2+N], acc2.packet[2]); + } + if (N > 3) { + acc2.packet[3] = padd(tRes.packet[3+N], acc2.packet[3]); + } } // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. 
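// ---------------------------------------------------------------------------
// The rewrite above can drop the p16uc_SETCOMPLEX* permute masks because
// vec_mergeh/vec_mergel already interleave two result "planes" into
// (real, imag) pairs. A hedged standalone demo for Packet2d-sized lanes
// (POWER/VSX only; the values are illustrative):
#ifdef __VSX__
#include <altivec.h>
inline void merge_complex_demo()
{
  __vector double re = {1.0, 2.0};          // real parts of two results
  __vector double im = {10.0, 20.0};        // imaginary parts of the same two
  __vector double c0 = vec_mergeh(re, im);  // {1.0, 10.0}: first complex value
  __vector double c1 = vec_mergel(re, im);  // {2.0, 20.0}: second complex value
  (void)c0; (void)c1;
}
#endif
// ploadRhs itself is defined just below.
// ---------------------------------------------------------------------------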
template -EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs) +EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) { - return *reinterpret_cast(const_cast(rhs)); + return ploadu(rhs); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 8edf79c4b7c3404aee6eacafa1bda23d55dcd962..7dda42339ee8e51137ac8e9d7301411666de9155 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -11,7 +11,11 @@ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H -#pragma GCC target("cpu=power10") +// If using dynamic dispatch, set the CPU target. +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC push_options +#pragma GCC target("cpu=power10,htm") +#endif #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) @@ -24,48 +28,48 @@ namespace Eigen { namespace internal { template -EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad* acc) +EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); } template -EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc) { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); - bscale(tRes, result, alpha); + bscale(tRes, result, alpha); - data.template storePacketBlock(i, j, tRes); + data.template storePacketBlock(i, 0, tRes); } -template -EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) { PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); __builtin_mma_disassemble_acc(&resultImag.packet, accImag); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); PacketBlock taccReal, taccImag; bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); PacketBlock acc1, acc2; - bcouple(taccReal, taccImag, tRes, acc1, acc2); + bcouple(taccReal, taccImag, tRes, acc1, acc2); - data.template storePacketBlock(i + N*accColsC, j, acc1); - data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); + data.template storePacketBlock(i, 0, acc1); + data.template storePacketBlock(i + accColsC, 0, acc2); } // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) { if(NegativeAccumulate) { @@ -76,7 +80,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) { __vector_pair* a0 = (__vector_pair *)(&a.packet[0]); if(NegativeAccumulate) @@ -88,7 +92,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock 
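// ---------------------------------------------------------------------------
// A minimal illustration of the __vector_quad accumulator lifecycle that
// bsetzeroMMA/pgerMMA/storeAccumulator wrap (Power10 + MMA builds only).
// The builtins are real GCC/Clang MMA intrinsics; the wrapper name and
// output layout here are illustrative.
#if defined(__MMA__) && defined(__VSX__)
#include <altivec.h>
inline void mma_accumulator_demo(const float* a4, const float* b4, float out[4][4])
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);                       // acc = 0
  __vector float va = vec_xl(0, a4);                   // 4 lhs values
  __vector float vb = vec_xl(0, b4);                   // 4 rhs values
  __builtin_mma_xvf32gerpp(&acc,                       // rank-1 update of the
                           (__vector unsigned char)va, // 4x4 accumulator
                           (__vector unsigned char)vb);
  __vector float rows[4];
  __builtin_mma_disassemble_acc(rows, &acc);           // spill acc to 4 VSRs
  for (int i = 0; i < 4; ++i) vec_xst(rows[i], 0, out[i]);
}
#endif
// ---------------------------------------------------------------------------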
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) { if(NegativeAccumulate) { @@ -99,15 +103,13 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet4f& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) { - EIGEN_UNUSED_VARIABLE(acc); // Just for compilation - EIGEN_UNUSED_VARIABLE(a); - EIGEN_UNUSED_VARIABLE(b); + // Just for compilation } template -EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) +EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) { pgerMMA(accReal, rhsV, lhsV); if(LhsIsReal) { @@ -125,20 +127,20 @@ EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. template -EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) { - rhsV = ploadRhs((const Scalar*)(rhs)); + rhsV = ploadRhs(rhs); } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) { rhsV.packet[0] = ploadRhs((const double *)((Packet2d *)rhs )); rhsV.packet[1] = ploadRhs((const double *)(((Packet2d *)rhs) + 1)); } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) { #if EIGEN_COMP_LLVM __builtin_vsx_assemble_pair(&rhsV, @@ -150,11 +152,9 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, _ } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) { // Just for compilation - EIGEN_UNUSED_VARIABLE(rhs); - EIGEN_UNUSED_VARIABLE(rhsV); } // PEEL_MMA loop factor. 
@@ -188,12 +188,11 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) } #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0; \ @@ -226,7 +225,7 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) #define MICRO_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -242,26 +241,23 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) #define MICRO_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ + storeAccumulator(row + iter*accCols, res, pAlpha, &accZero##iter); \ } #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { -asm("#gemm_MMA begin"); const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; MICRO_MMA_SRC_PTR @@ -281,97 +277,100 @@ asm("#gemm_MMA begin"); MICRO_MMA_STORE row += unroll_factor*accCols; -asm("#gemm_MMA end"); } -template -void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +template +EIGEN_ALWAYS_INLINE void gemmMMA_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { - const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; + const DataMapper res3 = res.getSubMapper(0, col); - if( strideA == -1 ) strideA = depth; - if( strideB == -1 ) strideB = depth; - - const Packet pAlpha = pset1(alpha); - const Packet pMask = bmask((const int)(remaining_rows)); - - Index col = 0; - for(; col + accRows <= cols; col += accRows) - { - const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + 
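// ---------------------------------------------------------------------------
// Why "+ accCols*offsetA" disappears from MICRO_MMA_SRC_PTR_ONE below: the
// offset is now folded into lhs_base once per column block, and
//   blockA + ((row/accCols) + iter)*strideA*accCols + accCols*offsetA
//     == (blockA + accCols*offsetA) + ((row/accCols) + iter)*strideA*accCols
//     ==  lhs_base                  + ((row/accCols) + iter)*strideA*accCols,
// so every unroll slot computes the same address with one fewer term.
// ---------------------------------------------------------------------------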
const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - Index row = 0; #define MAX_MMA_UNROLL 7 - while(row + MAX_MMA_UNROLL*accCols <= rows) { - gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { + while(row + MAX_MMA_UNROLL*accCols <= rows) { + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + } + switch( (rows-row)/accCols ) { #if MAX_MMA_UNROLL > 7 - case 7: - gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 7: + gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 6 - case 6: - gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 6: + gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 5 - case 5: - gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 5: + gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 4 - case 4: - gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 4: + gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 3 - case 3: - gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 3: + gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 2 - case 2: - gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 2: + gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 1 - case 1: - gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 1: + gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; - } + default: + break; + } #undef MAX_MMA_UNROLL - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } 
+} - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; +template +void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; - for(; col < cols; col++) - { - Index row = 0; + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; - } + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -379,21 +378,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define advanceCols ((RhsIsReal) ? 1 : 2) // PEEL_COMPLEX_MMA loop factor. -#define PEEL_COMPLEX_MMA 7 +#define PEEL_COMPLEX_MMA 3 #define MICRO_COMPLEX_MMA_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -406,8 +404,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ if (PEEL_COMPLEX_MMA > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ if(!RhsIsReal) { \ ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ @@ -415,20 +413,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ MICRO_COMPLEX_MMA_UNROLL(func2); \ - func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ + type rhsV0, rhsV1, rhsV2, rhsV3; \ + type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); 
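// ---------------------------------------------------------------------------
// MICRO_COMPLEX_MMA_LOAD_ONE below relies on the packed complex-LHS layout
// (assumed from the packing code, which is outside this hunk): each
// accCols-wide panel stores a plane of real parts and, accCols*strideA
// elements later, the matching plane of imaginary parts. One pointer then
// addresses both planes,
//   imag_delta = accCols * strideA;
//   real part of element k -> lhs_ptr_real[k]
//   imag part of element k -> lhs_ptr_real[k + imag_delta]
// which is why the separate lhs_ptr_imag pointers and their bookkeeping can
// be deleted throughout this patch.
// ---------------------------------------------------------------------------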
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0, rhsVi0; \ @@ -465,15 +460,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) @@ -481,46 +470,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ } #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { -asm("#gemm_complex_MMA begin"); const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; - __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_DST_PTR @@ -542,14 +525,72 @@ asm("#gemm_complex_MMA begin"); MICRO_COMPLEX_MMA_STORE row += unroll_factor*accCols; -asm("#gemm_complex_MMA end"); +} + +template +EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( + const DataMapper& res, + const Scalar* blockA, + const 
Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 1 + case 1: + gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } template void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -564,74 +605,23 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_MMA_UNROLL 4 - while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { - gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_MMA_UNROLL > 4 - case 4: - gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 3 - case 3: - gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, 
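// ---------------------------------------------------------------------------
// gemmMMA_complex_cols above hoists the column offset into a submapper once
// per column block (res3 = res.getSubMapper(0, col)), so the store helpers
// can drop their column parameter and always index column 0. Schematically,
// assuming the usual sub-mapper semantics: res.getSubMapper(0, col) at
// (i, 0) addresses the same element as res at (i, col), so
// storeComplexAccumulator(row, res3, ...) writes exactly where the old
// storeComplexAccumulator(row, col, res, ...) did, with one fewer index to
// thread through every macro.
// ---------------------------------------------------------------------------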
accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 2 - case 2: - gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 1 - case 1: - gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC #undef advanceRows #undef advanceCols -#pragma GCC reset_options } // end namespace internal } // end namespace Eigen +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC pop_options +#endif + #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 7c70c07b1c86a269a1cacfa70b2fad8b2fb6fd4d..528f995d3535b0216adae4df5de0a3cbfa591517 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -437,7 +433,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_xl(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); #endif @@ -790,8 +786,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE 
Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p4f_MZERO); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p4i_ZERO - a; +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -869,19 +879,31 @@ template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, con template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +// Works around a bug in vec_cmplt on older compiler versions +#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } - -template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } - template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { Packet4f c = reinterpret_cast(vec_cmpge(a,b)); return vec_nor(c,c); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return 
reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } @@ -906,8 +928,8 @@ template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, con return pxor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { return vec_sel(b, a, reinterpret_cast(mask)); @@ -956,7 +978,7 @@ template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPAC return (Packet) vec_perm(MSQ, LSQ, mask); // align the data #else EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #endif } @@ -1264,15 +1286,15 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast(p4i_ZERO)); - Packet4bi is_mant_not_zero = vec_cmpne(mantissa, reinterpret_cast(p4i_ZERO)); - Packet4ui nan_selector = pand( + Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast(p4i_ZERO)); + Packet4ui nan_selector = pandnot( reinterpret_cast(is_max_exp), - reinterpret_cast(is_mant_not_zero) + reinterpret_cast(is_mant_zero) ); - Packet4ui subnormal_selector = pand( + Packet4ui subnormal_selector = pandnot( reinterpret_cast(is_zero_exp), - reinterpret_cast(is_mant_not_zero) + reinterpret_cast(is_mant_zero) ); const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); @@ -1336,16 +1358,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, con BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); } -template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_generic(a,exponent); } @@ -1411,6 +1423,9 @@ template<> EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, con template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt, a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan, a, b); +} template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, 
const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le, a, b); } @@ -2260,7 +2275,8 @@ static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); -static Packet2d p2d_MZERO = { -0.0, -0.0 }; +static Packet2d p2d_MZERO = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -2295,7 +2311,11 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 1, HasSqrt = 1, +#if !EIGEN_COMP_CLANG HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -2384,7 +2404,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p2d_MZERO); +#endif +} template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -2453,7 +2480,7 @@ template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return vec_xl(0, const_cast(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index b1618e56784bd3586e2bb4475a2adc09cc0f8687..45f6ddb949fd224b26a2c1ce514af590e8dffb38 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -11,13 +11,24 @@ #ifndef EIGEN_COMPLEX_CUDA_H #define EIGEN_COMPLEX_CUDA_H -// clang-format off // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, GCC and older versions of clang do // not treat them as device functions and thus Eigen functors making use of // these operators fail to compile. Here, we manually specialize these // operators and functors for complex types when building for CUDA to enable // their use on-device. +// +// NOTES: +// - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device, +// since they are already specialized in the standard. Using them will result +// in silent kernel failures. +// - Compiling with MSVC and using +=,-=,*=,/=(std::complex<T>) will lead +// to duplicate definition errors, since these are already specialized in +// Visual Studio's <complex> header (contrary to the standard). This is +// preferable to removing such definitions, which will lead to silent kernel +// failures. +// - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior +// to the first inclusion of <complex>. 
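As a rough illustration of the note above (a sketch, not part of the patch itself; the header generates its real definitions for float and double through macros), a manually specialized device-side operator takes roughly this shape:

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> operator*(
    const std::complex<float>& a, const std::complex<float>& b) {
  // Written out by hand so nvcc/older clang treat it as a device function;
  // the std:: overload is not constexpr and stays host-only under those compilers.
  return std::complex<float>(a.real() * b.real() - a.imag() * b.imag(),
                             a.real() * b.imag() + a.imag() * b.real());
}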
#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) @@ -67,27 +78,26 @@ std::complex complex_divide_fast(const std::complex& a, const std::complex const T a_imag = numext::imag(a); const T b_real = numext::real(b); const T b_imag = numext::imag(b); - const T norm = T(1) / (b_real * b_real + b_imag * b_imag); - return std::complex((a_real * b_real + a_imag * b_imag) * norm, - (a_imag * b_real - a_real * b_imag) * norm); + const T norm = (b_real * b_real + b_imag * b_imag); + return std::complex((a_real * b_real + a_imag * b_imag) / norm, + (a_imag * b_real - a_real * b_imag) / norm); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex complex_divide_stable(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); const T b_real = numext::real(b); const T b_imag = numext::imag(b); - // Guard against over/under-flow. - const T scale = T(1) / (numext::abs(b_real) + numext::abs(b_imag)); - const T a_real_scaled = numext::real(a) * scale; - const T a_imag_scaled = numext::imag(a) * scale; - const T b_real_scaled = b_real * scale; - const T b_imag_scaled = b_imag * scale; - - const T b_norm2_scaled = b_real_scaled * b_real_scaled + b_imag_scaled * b_imag_scaled; - return std::complex( - (a_real_scaled * b_real_scaled + a_imag_scaled * b_imag_scaled) / b_norm2_scaled, - (a_imag_scaled * b_real_scaled - a_real_scaled * b_imag_scaled) / b_norm2_scaled); + // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf), + // guards against over/under-flow. + const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real); + const T rscale = scale_imag ? T(1) : b_real / b_imag; + const T iscale = scale_imag ? b_imag / b_real : T(1); + const T denominator = b_real * rscale + b_imag * iscale; + return std::complex((a_real * rscale + a_imag * iscale) / denominator, + (a_imag * rscale - a_real * iscale) / denominator); } template diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index aac60f15cc2801c1aacdc50053e38fdfc433decd..f21d1a0a32ccea5f965d9a2a1f8e42449399d661 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h @@ -250,17 +250,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { output.value = std::signbit(v) ? 0xFFC0: 0x7FC0; return output; - } else if (std::fabs(v) < std::numeric_limits::min EIGEN_NOT_A_MACRO()) { - // Flush denormal to +/- 0. - output.value = std::signbit(v) ? 0x8000 : 0; - return output; } - const uint16_t* p = reinterpret_cast(&v); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - output.value = p[0]; -#else - output.value = p[1]; -#endif + output.value = static_cast(numext::bit_cast(v) >> 16); return output; } @@ -288,9 +279,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne::min EIGEN_NOT_A_MACRO()) { - // Flush denormal to +/- 0.0 - output.value = std::signbit(ff) ? 0x8000 : 0; } else { // Fast rounding algorithm that rounds a half value to nearest even. This // reduces expected error when we convert a large number of floats. 
Here @@ -469,14 +457,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(&result); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - q[0] = h.value; -#else - q[1] = h.value; -#endif - return result; + return numext::bit_cast(static_cast(h.value) << 16); } // --- standard functions --- diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h index 4cfe34e05268c459645c00449b0a92b23f1d0c2b..53830b5a274c72546315eaea5d1fb4fb737e3a8b 100644 --- a/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -11,19 +11,107 @@ #ifndef EIGEN_ARCH_CONJ_HELPER_H #define EIGEN_ARCH_CONJ_HELPER_H -#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ - }; \ - \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \ + const PACKET_CPLX& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \ + const PACKET_CPLX& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); \ + } \ + }; \ + \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \ + const PACKET_REAL& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \ + const PACKET_REAL& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); \ + } \ }; -#endif // EIGEN_ARCH_CONJ_HELPER_H +namespace Eigen { +namespace internal { + +template struct conj_if; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); } +}; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; } +}; + +// Generic Implementation, assume scalars since the packet-version is +// specialized below. 
+template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsType& x, const RhsType& y) const + { return conj_if()(x) * conj_if()(y); } +}; + +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsScalar& x, const RhsScalar& y) const + { return numext::conj(x * y); } +}; + +// Implementation with equal types, using packet operations. +template +struct conj_helper +{ + typedef Packet ResultType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } +}; + +template +struct conj_helper +{ + typedef Packet ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return pconj(Eigen::internal::pmul(x, y)); } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 87e8c27033b27466ba36a312c684ed61ef06de9d..95fb686a1cae36fef9277594042f7139ec0b5777 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -19,12 +19,6 @@ namespace Eigen { namespace internal { -template EIGEN_DEVICE_FUNC inline Packet -pset(const typename unpacket_traits::type (&a)[N] /* a */) { - EIGEN_STATIC_ASSERT(unpacket_traits::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE); - return pload(a); -} - // Creates a Scalar integer type with same bit-width. template struct make_integer; template<> struct make_integer { typedef numext::int32_t type; }; @@ -808,9 +802,8 @@ Packet psqrt_complex(const Packet& a) { // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. - Packet a_flip = pcplxflip(a); RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|] - RealPacket a_abs_flip = pabs(a_flip.v); // [|y0|, |x0|, |y1|, |x1|] + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|] RealPacket a_max = pmax(a_abs, a_abs_flip); RealPacket a_min = pmin(a_abs, a_abs_flip); RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); @@ -839,7 +832,8 @@ Packet psqrt_complex(const Packet& a) { // Step 4.
Compute solution for inputs with negative real part: // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] - const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), RealScalar(-0.0))).v; + const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); + const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); Packet negative_real_result; // Notice that rho is positive, so taking it's absolute value is a noop. @@ -1449,39 +1443,40 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { } // Generic implementation of pow(x,y). -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet generic_pow(const Packet& x, const Packet& y) { +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) { typedef typename unpacket_traits::type Scalar; const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_neg_inf = pset1(-NumTraits::infinity()); const Packet cst_zero = pset1(Scalar(0)); const Packet cst_one = pset1(Scalar(1)); const Packet cst_nan = pset1(NumTraits::quiet_NaN()); const Packet abs_x = pabs(x); // Predicates for sign and magnitude of x. - const Packet x_is_zero = pcmp_eq(x, cst_zero); - const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero); + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero); const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); - const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); - const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); - const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); // Predicates for sign and magnitude of y. + const Packet abs_y = pabs(y); const Packet y_is_one = pcmp_eq(y, cst_one); - const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero); const Packet y_is_neg = pcmp_lt(y, cst_zero); - const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg)); const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); - const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf); EIGEN_CONSTEXPR Scalar huge_exponent = - (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / - NumTraits::epsilon(); + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits::epsilon(); const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); // Predicates for whether y is integer and/or even. 
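For reference, the integer/evenness predicates that generic_pow builds below from pround and pcmp_eq are the packet form of the following scalar tests (an illustrative sketch, not part of the patch):

#include <cmath>
// y is an integer exactly when rounding it changes nothing;
// an integer y is even exactly when y/2 is itself an integer.
inline bool scalar_y_is_int(double y)  { return std::round(y) == y; }
inline bool scalar_y_is_even(double y) { return std::round(y / 2) == y / 2; }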
@@ -1490,39 +1485,31 @@ Packet generic_pow(const Packet& x, const Packet& y) { const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); // Predicates encoding special cases for the value of pow(x,y) - const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), - y_is_int), - abs_y_is_inf); - const Packet pow_is_one = por(por(x_is_one, y_is_zero), - pand(x_is_neg_one, - por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf); const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); - const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), - pand(abs_x_is_inf, y_is_neg)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_pos)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_neg)); - const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), - pand(abs_x_is_inf, y_is_pos)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_neg)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_pos)); + const Packet pow_is_one = + por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos)); + const Packet inf_val = + pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even), + cst_neg_inf, cst_pos_inf); // General computation of pow(x,y) for positive x or negative x and integer y. const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); const Packet pow_abs = generic_pow_impl(abs_x, y); - return pselect(y_is_one, x, - pselect(pow_is_one, cst_one, - pselect(pow_is_nan, cst_nan, - pselect(pow_is_inf, cst_pos_inf, - pselect(pow_is_zero, cst_zero, - pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); + return pselect( + y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, inf_val, + pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); } - - /* polevl (modified for Eigen) * * Evaluate polynomial diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 637e5f4afcc0fb8ebbe7158a0bb1e3daa2d2b1ee..177a04e93e646cf735e1816857ea16de1cd8a4da 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -17,10 +17,6 @@ namespace internal { // implemented in GenericPacketMathFunctions.h // This is needed to workaround a circular dependency. 
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */ -template EIGEN_DEVICE_FUNC inline Packet -pset(const typename unpacket_traits::type (&a)[N] /* a */); - /*************************************************************************** * Some generic implementations to be used by implementors ***************************************************************************/ diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 9f8e8cc1e7a614cf86b4d4e7065df97fb06404c3..6e2b31f7625407974230797e712c73aa0c4b0ae9 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -36,8 +36,6 @@ #ifndef EIGEN_HALF_H #define EIGEN_HALF_H -#include - #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) // When compiling with GPU support, the "__half_raw" base class as well as // some other routines are defined in the GPU compiler header files @@ -334,7 +332,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { } #endif -#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); } @@ -534,7 +532,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #elif defined(EIGEN_HAS_FP16_C) __half_raw h; - h.x = _cvtss_sh(ff, 0); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0); + #else + h.x = _cvtss_sh(ff, 0); + #endif return h; #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) @@ -595,7 +598,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x))); + #else + return _cvtsh_ss(h.x); + #endif #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return static_cast(h.x); #else diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 689110dede13d0e26eb181e4de03cf3a8bc011c1..bfc11efbca83431f6b3839a06e15796ad0853bea 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -121,7 +121,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) { @@ -180,8 +179,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); } -} // namespace - template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, const float4& b) { @@ -493,9 +490,10 @@ ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. 
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +// Half-packet functions are not available on the host for CUDA 9.0-9.2, only +// on device. There is no benefit to using them on the host anyways, since they are +// emulated. +#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; @@ -526,42 +524,9 @@ template<> struct packet_traits : default_packet_traits }; }; -namespace { -// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __halves2half2(a, b); -#else - // Round-about way since __halves2half2 is a __device__ function. - return __floats2half2_rn(__half2float(a), __half2float(b)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __low2half(a); -#else - return __float2half(__low2float(a)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __high2half(a); -#else - return __float2half(__high2float(a)); -#endif -} -} // namespace - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2half2(from); -#else - const float f = __half2float(from); - return __floats2half2_rn(f, f); -#endif } template <> @@ -576,8 +541,6 @@ pset1(const Eigen::half& from) { return r; } -// We now need this visible on both host and device. -// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -585,11 +548,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return combine_half(from[0], from[1]); + return __halves2half2(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return combine_half(from[0], from[0]); + return __halves2half2(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -599,8 +562,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = get_half2_low(from); - to[1] = get_half2_high(from); + to[0] = __low2half(from); + to[1] = __high2half(from); } @@ -610,7 +573,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( // Input is guaranteed to be properly aligned. 
return __ldg(reinterpret_cast(from)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } @@ -619,31 +582,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( #if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return combine_half(from[0*stride], from[1*stride]); + return __halves2half2(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = get_half2_low(from); - to[stride*1] = get_half2_high(from); + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return get_half2_low(a); + return __low2half(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); + half a1 = __low2half(a); + half a2 = __high2half(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { @@ -658,12 +621,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = get_half2_low(kernel.packet[0]); - __half a2 = get_half2_high(kernel.packet[0]); - __half b1 = get_half2_low(kernel.packet[1]); - __half b2 = get_half2_high(kernel.packet[1]); - kernel.packet[0] = combine_half(a1, b1); - kernel.packet[1] = combine_half(a2, b2); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { @@ -671,88 +634,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return combine_half(a, __float2half(f)); + return __halves2half2(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = get_half2_low(mask); - half mask_high = get_half2_high(mask); - half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); - half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); - return combine_half(result_low, result_high); + half mask_low = __low2half(mask); + half mask_high = __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? 
__high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, @@ -851,9 +814,9 @@ 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -862,9 +825,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { @@ -885,7 +848,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? get_half2_low(a) : get_half2_high(a); + return a1 > a2 ? __low2half(a) : __high2half(a); #endif } @@ -897,7 +860,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? get_half2_low(a) : get_half2_high(a); + return a1 < a2 ? __low2half(a) : __high2half(a); #endif } @@ -1068,10 +1031,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); - p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); - p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); - p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); + p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); + p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); + p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); return r; } @@ -1152,12 +1115,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = get_half2_low(f0); - __half a2 = get_half2_high(f0); - __half b1 = get_half2_low(f1); - __half b2 = get_half2_high(f1); - f0 = combine_half(a1, b1); - f1 = combine_half(a2, b2); + __half a1 = __low2half(f0); + __half a2 = __high2half(f0); + __half b1 = __low2half(f1); + __half b2 = __high2half(f1); + f0 = __halves2half2(a1, b1); + f1 = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -1254,10 +1217,10 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(a, __float2half(f + 1.0f)); - p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); + p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); + 
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif } @@ -1477,9 +1440,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_max(a_alias[0]), + half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = combine_half(predux_max(a_alias[2]), + half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1496,9 +1459,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_min(a_alias[0]), + half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = combine_half(predux_min(a_alias[2]), + half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1652,9 +1615,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } template<> @@ -1664,14 +1627,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? 
__high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } -// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) - -#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) #undef EIGEN_GPU_HAS_LDG #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 75454622559cece4eed6303bed8fdb37ab2aecd8..c8195bb2b094633dc03cc71c0144ce43f23a6d66 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -15,8 +15,7 @@ namespace Eigen { namespace internal { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 4877a95a8d042e827b5809b5cfd94830690b9623..53dacfa43d8932e31c2ba1cf5dac9a09bef3237e 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -305,42 +305,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) template <> @@ -644,42 +608,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& return pfirst(a); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) template <> diff --git 
a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index f03cf61ffb6980691642730c79655da26c5c44db..afe8f3375ba9bcfcaf208222ac3fd52ab03433d8 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -28,10 +28,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 1aa361bc063eaa97542c67d013ad96927b679286..a58f13ca85028efeb1c57057964753176d48c831 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -124,24 +124,17 @@ template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); -template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f}; - return Packet2cf(padd(a.v, pxor(mask, b.v))); -} - template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) { - const Packet2ui b = vreinterpret_u32_f32(a.v); + const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v)); return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - const Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v)); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } @@ -349,67 +342,13 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return s; } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const 
Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) { // TODO optimize it for NEON - Packet1cf res = conj_helper().pmul(a,b); + Packet1cf res = pmul(a, pconj(b)); Packet2f s, rev_s; // this computes the norm @@ -421,7 +360,7 @@ template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a,pconj(b)); Packet4f s, rev_s; // this computes the norm @@ -610,39 +549,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return pconj(internal::pmul(a,b)); } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); Packet2d rev_s = preverse(s); diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index 3481f337e374c1fc3eb0b9e780ddb02da963a9e5..0963b0f1f0fbd009933fbe38b2327afbba98bf8a 100644 --- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -24,7 +24,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, - Packet4f& c, Packet4f& tmp, + Packet4f& c, Packet4f&, const LaneIdType&) const { acc(a, b, c); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9cf4e071210f3ddc09d88985ae33cb724090695c..f6d6d635a605996571fe7df4dc7a5c4ca663d0ad 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,10 +24,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #if EIGEN_ARCH_ARM64 #define 
EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -36,7 +32,7 @@ namespace internal { #endif #endif -#if EIGEN_COMP_MSVC +#if EIGEN_COMP_MSVC_STRICT // In MSVC's arm_neon.h header file, all NEON vector types // are aliases to the same underlying type __n128. @@ -82,11 +78,21 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; -#endif // EIGEN_COMP_MSVC +#endif // EIGEN_COMP_MSVC_STRICT + +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return vld1q_f32(from); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + float from[2] = {a, b}; + return vld1_f32(from); +} EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))); return res; } @@ -99,7 +105,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -108,7 +114,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -139,7 +145,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); } #define vec4f_duplane(a, p) \ - vdupq_lane_f32(vget_low_f32(a), p) + Packet4f(vdupq_lane_f32(vget_low_f32(a), p)) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -150,7 +156,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC // __builtin_prefetch tends to do nothing on ARM64 compilers because the // prefetch instructions there are too detailed for __builtin_prefetch to map // meaningfully to them. 
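
Note on the make_packet2f / make_packet4f helpers introduced above: brace-initializing a NEON vector (as in the replaced `Packet4f res = {...}` lines) is not portable, because under MSVC's arm_neon.h every NEON vector type is an alias of the same __n128 structure and cannot be list-initialized. Routing construction through a plain array and a vld1 load works on every compiler. A minimal standalone sketch of the same pattern (hypothetical helper name, not the Eigen code itself):

    #include <arm_neon.h>

    // Build a float32x4_t from four scalars portably: a plain array is always
    // brace-initializable, and vld1q_f32 turns it into a vector register.
    static inline float32x4_t make_f32x4(float a, float b, float c, float d) {
      const float from[4] = {a, b, c, d};
      return vld1q_f32(from);
    }

make_packet2f is the two-lane analogue of this, built on vld1_f32.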
@@ -159,7 +165,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif EIGEN_ARCH_ARM32
+#elif EIGEN_ARCH_ARM
 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
 // by default no explicit prefetching
@@ -866,12 +872,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
 
 template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
 template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b)
 {
-  Packet2f mask = {-0.0f, 0.0f};
+  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
 template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-  Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f};
+  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
@@ -2503,7 +2509,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4<Packet4us>(const Packet8us& a)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
 { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
 {
   int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
@@ -2517,7 +2523,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
 }
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
 {
   uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
@@ -2531,7 +2537,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
 }
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
 template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
 {
   const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
@@ -2567,11 +2573,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
 { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
 { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
 template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
 { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
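
The paddsub masks above are rebuilt from an integer bit pattern because the literal -0.0f is fragile: under -ffast-math or -fno-signed-zeros a compiler may fold -0.0f into +0.0f, silently turning the sign-flip mask into all-zero bits. numext::bit_cast pins the exact 0x80000000 pattern regardless of floating-point flags. A minimal sketch of the idea, assuming C++20 std::bit_cast in place of Eigen's numext::bit_cast:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Materialize the sign-bit-only float from its exact bit pattern.
      const std::uint32_t sign_bit = 0x80000000u;
      const float mask = std::bit_cast<float>(sign_bit);  // numerically -0.0f
      // XOR with the mask flips the sign; this is how paddsub negates odd lanes.
      const float x = 3.5f;
      const float y = std::bit_cast<float>(std::bit_cast<std::uint32_t>(x) ^ sign_bit);
      std::printf("%g -> %g\n", x, y);  // prints "3.5 -> -3.5"
      return 0;
    }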
template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) @@ -2774,352 +2780,265 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +// Helpers for ptranspose. +namespace detail { + +template +void zip_in_place(Packet& p1, Packet& p2); + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2f& p1, Packet2f& p2) { + const float32x2x2_t tmp = vzip_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); - kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4f& p1, Packet4f& p2) { + const float32x4x2_t tmp = vzipq_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); - const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - const int8x8x2_t zip8 = vzip_s8(a,b); - const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8c& p1, Packet8c& p2) { + const int8x8x2_t tmp = vzip_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16c& p1, Packet16c& p2) { + const int8x16x2_t tmp = vzipq_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - int8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_s8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8uc& p1, Packet8uc& p2) { + const uint8x8x2_t tmp = vzip_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_s8(zip8[i*2].val[j]), vreinterpret_u16_s8(zip8[i*2+1].val[j])); - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16uc& p1, Packet16uc& p2) { + const uint8x16x2_t tmp = vzipq_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - 
EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_s8_u32(z.val[k]); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2i& p1, Packet2i& p2) { + const int32x2x2_t tmp = vzip_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - int8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_s8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4i& p1, Packet4i& p2) { + const int32x4x2_t tmp = vzipq_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_s8(zip8[i*2].val[j]), - vreinterpretq_u16_s8(zip8[i*2+1].val[j])); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2ui& p1, Packet2ui& p2) { + const uint32x2x2_t tmp = vzip_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]), - vreinterpretq_u32_u16(zip16[i*4+j+2].val[k])); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4ui& p1, Packet4ui& p2) { + const uint32x4x2_t tmp = vzipq_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - kernel.packet[i*4+j*2] = vreinterpretq_s8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]), - vget_low_u32(zip32[i+4].val[j]))); - kernel.packet[i*4+j*2+1] = vreinterpretq_s8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]), - vget_high_u32(zip32[i+4].val[j]))); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4s& p1, Packet4s& p2) { + const int16x4x2_t tmp = vzip_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); - const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - const uint8x8x2_t zip8 = vzip_u8(a,b); - const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8s& p1, Packet8s& p2) { + const int16x8x2_t tmp = vzipq_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4us& p1, Packet4us& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - uint8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_u8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8us& p1, Packet8us& p2) { + const uint16x8x2_t tmp = 
vzipq_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_u8(zip8[i*2].val[j]), vreinterpret_u16_u8(zip8[i*2+1].val[j])); - } +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_u8_u32(z.val[k]); - } - } +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - uint8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_u8(kernel.packet[i*2], kernel.packet[i*2+1]); +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_u8(zip8[i*2].val[j]), - vreinterpretq_u16_u8(zip8[i*2+1].val[j])); - } + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], kernel.packet[7]); +} +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { + for (int i=0; i<4; ++i) { + const int m = (1 << i); EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { + for (int j=0; j& kernel) -{ - const int16x4x2_t zip16_1 = vzip_s16(kernel.packet[0], kernel.packet[1]); - const int16x4x2_t zip16_2 = vzip_s16(kernel.packet[2], kernel.packet[3]); - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[0]), vreinterpret_u32_s16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[1]), vreinterpret_u32_s16(zip16_2.val[1])); +} // namespace detail - kernel.packet[0] = vreinterpret_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]); - - const uint32x4x2_t zip32_1 = 
vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpretq_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpretq_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpretq_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - const int8x16x2_t zip8_1 = vzipq_s8(kernel.packet[0], kernel.packet[1]); - const int8x16x2_t zip8_2 = vzipq_s8(kernel.packet[2], kernel.packet[3]); + const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); + const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - const int16x8x2_t zip16_1 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[0]), vreinterpretq_s16_s8(zip8_2.val[0])); - const int16x8x2_t zip16_2 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[1]), vreinterpretq_s16_s8(zip8_2.val[1])); + const int8x8x2_t zip8 = vzip_s8(a,b); + const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); - kernel.packet[0] = vreinterpretq_s8_s16(zip16_1.val[0]); - kernel.packet[1] = vreinterpretq_s8_s16(zip16_1.val[1]); - kernel.packet[2] = vreinterpretq_s8_s16(zip16_2.val[0]); - kernel.packet[3] = vreinterpretq_s8_s16(zip16_2.val[1]); + kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]); - const uint8x16x2_t zip8_2 = vzipq_u8(kernel.packet[2], kernel.packet[3]); - - const uint16x8x2_t zip16_1 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[0]), vreinterpretq_u16_u8(zip8_2.val[0])); - const uint16x8x2_t zip16_2 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[1]), vreinterpretq_u16_u8(zip8_2.val[1])); - - kernel.packet[0] = vreinterpretq_u8_u16(zip16_1.val[0]); - kernel.packet[1] = vreinterpretq_u8_u16(zip16_1.val[1]); - kernel.packet[2] = vreinterpretq_u8_u16(zip16_2.val[0]); - kernel.packet[3] = vreinterpretq_u8_u16(zip16_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = 
vzipq_s16(kernel.packet[2], kernel.packet[3]); - const int16x8x2_t zip16_3 = vzipq_s16(kernel.packet[4], kernel.packet[5]); - const int16x8x2_t zip16_4 = vzipq_s16(kernel.packet[6], kernel.packet[7]); + const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); + const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[0]), vreinterpretq_u32_s16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[1]), vreinterpretq_u32_s16(zip16_4.val[1])); + const uint8x8x2_t zip8 = vzip_u8(a,b); + const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); - kernel.packet[0] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); + kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint16x4x2_t zip16_1 = vzip_u16(kernel.packet[0], kernel.packet[1]); - const uint16x4x2_t zip16_2 = vzip_u16(kernel.packet[2], kernel.packet[3]); - - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[0]), vreinterpret_u32_u16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[1]), vreinterpret_u32_u16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpret_u16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_u16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_u16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_u16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + 
detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint16x8x2_t zip16_1 = vzipq_u16(kernel.packet[0], kernel.packet[1]); - const uint16x8x2_t zip16_2 = vzipq_u16(kernel.packet[2], kernel.packet[3]); - const uint16x8x2_t zip16_3 = vzipq_u16(kernel.packet[4], kernel.packet[5]); - const uint16x8x2_t zip16_4 = vzipq_u16(kernel.packet[6], kernel.packet[7]); - - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[0]), vreinterpretq_u32_u16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[1]), vreinterpretq_u32_u16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[0]), vreinterpretq_u32_u16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[1]), vreinterpretq_u32_u16(zip16_4.val[1])); - kernel.packet[0] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int32x2x2_t z = vzip_s32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); - kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); - kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); - kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint32x2x2_t z = vzip_u32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + 
detail::ptranspose_impl(kernel);
+}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel)
-{
-  const uint32x4x2_t tmp1 = vzipq_u32(kernel.packet[0], kernel.packet[1]);
-  const uint32x4x2_t tmp2 = vzipq_u32(kernel.packet[2], kernel.packet[3]);
-  kernel.packet[0] = vcombine_u32(vget_low_u32(tmp1.val[0]), vget_low_u32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_u32(vget_high_u32(tmp1.val[0]), vget_high_u32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_u32(vget_low_u32(tmp1.val[1]), vget_low_u32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_u32(vget_high_u32(tmp1.val[1]), vget_high_u32(tmp2.val[1]));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel)
 {
 #if EIGEN_ARCH_ARM64
   const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
-  const int64x2_t tmp2 = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
-
+  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
 #else
   const int64x1_t tmp[2][2] = {
     { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
@@ -3135,10 +3054,8 @@ ptranspose(PacketBlock<Packet2ul, 2>& kernel)
 {
 #if EIGEN_ARCH_ARM64
   const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
-  const uint64x2_t tmp2 = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
-
+  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
 #else
   const uint64x1_t tmp[2][2] = {
     { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
@@ -3468,11 +3385,20 @@ template<> struct unpacket_traits<Packet4bf>
   };
 };
 
+namespace detail {
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+} // namespace detail
+
 EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
 {
   // See the scalar implementation in BFloat16.h for a comprehensible explanation
   // of this fast rounding algorithm
-  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
 
   // lsb = (input >> 16) & 1
   Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
@@ -3497,7 +3423,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
 
 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
 {
-  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
 }
 
 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
@@ -3505,21 +3431,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
-  return pset1<Packet4us>(from.value);
+  return Packet4bf(pset1<Packet4us>(from.value));
 }
 
 template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
-  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const
bfloat16* from) { - return pload(reinterpret_cast(from)); + return Packet4bf(pload(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { - return ploadu(reinterpret_cast(from)); + return Packet4bf(ploadu(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) @@ -3534,7 +3460,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) { - return ploaddup(reinterpret_cast(from)); + return Packet4bf(ploaddup(reinterpret_cast(from))); } template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { @@ -3581,25 +3507,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) } template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { - return por(a, b); + return Packet4bf(por(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { - return pxor(a, b); + return Packet4bf(pxor(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { - return pand(a, b); + return Packet4bf(pand(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { - return pandnot(a, b); + return Packet4bf(pandnot(Packet4us(a), Packet4us(b))); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) { - return pselect(mask, a, b); + return Packet4bf(pselect(Packet4us(mask), Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) @@ -3638,13 +3564,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, con template<> EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) { - return pgather(reinterpret_cast(from), stride); + return Packet4bf(pgather(reinterpret_cast(from), stride)); } template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) { - pscatter(reinterpret_cast(to), from, stride); + pscatter(reinterpret_cast(to), Packet4us(from), stride); } template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) @@ -3669,21 +3595,12 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet4bf& a template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) { - return preverse(a); + return Packet4bf(preverse(Packet4us(a))); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - PacketBlock k; - k.packet[0] = kernel.packet[0]; - k.packet[1] = kernel.packet[1]; - k.packet[2] = kernel.packet[2]; - k.packet[3] = kernel.packet[3]; - ptranspose(k); - kernel.packet[0] = k.packet[0]; - kernel.packet[1] = k.packet[1]; - kernel.packet[2] = k.packet[2]; - kernel.packet[3] = k.packet[3]; + detail::ptranspose_impl(kernel); } template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff(const Packet4bf& a, const Packet4bf& b) @@ -3701,6 +3618,11 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt(const Packet4bf& a, return F32MaskToBf16Mask(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); } +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b) { return F32MaskToBf16Mask(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); @@ 
-3708,7 +3630,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b)
 
 template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a)
 {
-  return pxor(a, pset1<Packet4bf>(static_cast<uint16_t>(0x8000)));
+  return Packet4bf(pxor(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
 }
 
 //---------- double ----------
@@ -3726,16 +3648,29 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a)
 
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
+#if EIGEN_COMP_GNUC
 // Bug 907: workaround missing declarations of the following two functions in the ADK
 // Defining these functions as templates ensures that if these intrinsics are
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
+// This doesn't work with MSVC though, since the function names are macros.
 template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
 template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+#endif
 
+#if EIGEN_COMP_MSVC_STRICT
+typedef eigen_packet_wrapper<float64x2_t, 0> Packet2d;
+typedef eigen_packet_wrapper<float64x1_t, 1> Packet1d;
+#else
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
+#endif
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  double from[2] = {a, b};
+  return vld1q_f64(from);
+}
 
 // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
@@ -3744,7 +3679,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
 {
   const double* a = reinterpret_cast<const double*>(&m);
   const double* b = reinterpret_cast<const double*>(&n);
-  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
   return res;
 }
@@ -3761,7 +3696,7 @@ EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
   return shuffle(a, b, 3);
 }
 #define vec2d_duplane(a, p) \
-  vdupq_laneq_f64(a, p)
+  Packet2d(vdupq_laneq_f64(a, p))
 
 template<> struct packet_traits<double> : default_packet_traits
 {
@@ -3835,7 +3770,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b)
 
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
 template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
-  const Packet2d mask = {-0.0,0.0};
+  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
   return padd(a, pxor(mask, b));
 }
@@ -3950,7 +3885,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
 #endif
 
 // min
@@ -4006,7 +3941,7 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
 template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
 
-#endif // EIGEN_ARCH_ARM64
+#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 // Do we have fp16 types and supporting Neon intrinsics?
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 54f97336e03414097ebc4e5eaaa0e5545fed7276..c546466a137b003902070aa83ee1384e9bc7fb54 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -15,6 +15,113 @@ namespace Eigen { namespace internal { +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return Packet2f(vreinterpret_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return Packet2f(vreinterpret_f32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return Packet4f(vreinterpretq_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return Packet4f(vreinterpretq_f32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return Packet8c(vreinterpret_s8_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return Packet16c(vreinterpretq_s8_u8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return Packet8uc(vreinterpret_u8_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return Packet16uc(vreinterpretq_u8_s8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return Packet4s(vreinterpret_s16_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return Packet8s(vreinterpretq_s16_u16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return Packet4us(vreinterpret_u16_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return Packet8us(vreinterpretq_u16_s16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return Packet2i(vreinterpret_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return Packet2i(vreinterpret_s32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return Packet4i(vreinterpretq_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return Packet4i(vreinterpretq_s32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return Packet2ui(vreinterpret_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return Packet2ui(vreinterpret_u32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return Packet4ui(vreinterpretq_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return Packet4ui(vreinterpretq_u32_s32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return Packet2l(vreinterpretq_s64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return Packet2ul(vreinterpretq_u64_s64(a)); +} + 
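
The preinterpret specializations above were moved ahead of the pcast definitions so the pcast bodies below can reuse them. The two operations are easy to conflate: preinterpret is a bit-level retyping (a vreinterpret, which costs no instructions), while pcast is a value conversion (a vcvt or a widening/narrowing move). A minimal sketch of the distinction in raw NEON intrinsics (illustrative only):

    #include <arm_neon.h>

    void demo(float32x4_t f) {
      // preinterpret: same 128 bits, new static type; compiles to nothing.
      int32x4_t bits = vreinterpretq_s32_f32(f);
      // pcast: converts each float lane to an int32 value, truncating toward zero.
      int32x4_t vals = vcvtq_s32_f32(f);
      (void)bits;
      (void)vals;
    }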
//============================================================================== // pcast, SrcType = float //============================================================================== @@ -188,7 +295,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -212,11 +319,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -240,11 +347,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { - return vreinterpretq_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { - return vreinterpret_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -270,11 +377,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { - return vreinterpretq_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { - return vreinterpret_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { @@ -315,7 +422,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -339,11 +446,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -367,11 +474,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { - return vreinterpretq_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { - return vreinterpret_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -397,11 +504,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { - return vreinterpret_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { @@ -442,7 +549,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -466,11 +573,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -492,11 +599,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { - return vreinterpretq_u16_s16(a); + return 
preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { - return vreinterpret_u16_s16(a); + return preinterpret(a); } template <> @@ -559,7 +666,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -583,11 +690,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -609,11 +716,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { - return vreinterpretq_s16_u16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { - return vreinterpret_s16_u16(a); + return preinterpret(a); } template <> @@ -635,11 +742,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { - return vreinterpretq_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { - return vreinterpret_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } //============================================================================== @@ -674,7 +781,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -696,11 +803,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { - return vreinterpretq_u32_s32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { - return vreinterpret_u32_s32(a); + return preinterpret(a); } template <> @@ -799,7 +906,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -821,11 +928,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { - return vreinterpret_s32_u32(a); + return preinterpret(a); } template <> @@ -847,11 +954,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { - return vreinterpretq_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { - return vreinterpret_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -880,12 +987,12 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, const Packet4ui& d) { - return vreinterpretq_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, const Packet2ui& d) { - return vreinterpret_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } 
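
All of the rewrites above follow one factoring: only the signed conversion chain is implemented with real instructions, and the unsigned counterpart is obtained by a preinterpret of its result, so each signed/unsigned pair shares a single conversion path. A sketch of that shape in raw intrinsics, converting the first four lanes of an int8x16_t to uint32 (hypothetical helper, not the Eigen function itself):

    #include <arm_neon.h>

    uint32x4_t cast_s8_to_u32_first4(int8x16_t a) {
      int16x8_t w16 = vmovl_s8(vget_low_s8(a));      // sign-extend 8 -> 16 bit
      int32x4_t w32 = vmovl_s16(vget_low_s16(w16));  // sign-extend 16 -> 32 bit
      return vreinterpretq_u32_s32(w32);             // free bit-level retype
    }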
//============================================================================== @@ -915,7 +1022,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(a); } template <> @@ -1013,7 +1120,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); + return preinterpret(a); } template <> @@ -1031,7 +1138,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { - return vreinterpretq_s32_u32(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -1053,7 +1160,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d) { - return vreinterpretq_s16_u16(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> @@ -1077,122 +1184,40 @@ template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, const Packet2ul& g, const Packet2ul& h) { - return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); + return preinterpret(pcast(a, b, c, d, e, f, g, h)); } +#if EIGEN_ARCH_ARM64 + //============================================================================== -// preinterpret +// pcast/preinterpret, Double //============================================================================== -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { - return vreinterpret_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { - return vreinterpret_f32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return vreinterpretq_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { - return vreinterpretq_f32_u32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { - return vreinterpret_s8_u8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { - return vreinterpret_u8_s8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { - return vreinterpretq_u8_s8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { - return vreinterpret_s16_u16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { - return vreinterpretq_s16_u16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { - return vreinterpret_u16_s16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { - return vreinterpretq_u16_s16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { - return vreinterpret_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { - return vreinterpret_s32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return vreinterpretq_s32_f32(a); -} -template <> 
-EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); -} template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { - return vreinterpret_u32_f32(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return Packet2d(vreinterpretq_f64_s64(a)); } template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { - return vreinterpret_u32_s32(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return Packet2d(vreinterpretq_f64_u64(a)); } template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { - return vreinterpretq_u32_f32(a); +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return Packet2l(vreinterpretq_s64_f64(a)); } template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { - return vreinterpretq_u32_s32(a); +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return Packet2ul(vreinterpretq_u64_f64(a)); } - template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return Packet2d(vreinterpretq_f64_s32(a)); } template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { - return vreinterpretq_u64_s64(a); +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return Packet4i(vreinterpretq_s32_f64(a)); } -#if EIGEN_ARCH_ARM64 - -//============================================================================== -// pcast/preinterpret, Double -//============================================================================== - template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; @@ -1314,7 +1339,9 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s8(a))); + // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability. + Packet2f tmp = pcast(vget_low_s8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1324,7 +1351,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_u8(a))); + Packet2f tmp = pcast(vget_low_u8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1334,7 +1362,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1344,7 +1373,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { // Discard all but first two values. 
- return vcvt_f64_f32(pcast(vget_low_u16(a))); + Packet2f tmp = pcast(vget_low_u16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { - return vreinterpretq_f64_s64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { - return vreinterpretq_f64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { - return vreinterpretq_s64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { - return vreinterpretq_u64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { - return vreinterpretq_f64_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { - return vreinterpretq_s32_f64(a); -} - #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index b1edfa4b23f3acc33c968295aaddd7daa07c9554..215bfd7bb6a294690e9e1a9e640d47c7126619a3 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -19,7 +19,7 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; + Packet4f v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -66,12 +66,6 @@ template<> struct unpacket_traits { template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); -template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0)); - return Packet2cf(padd(a.v, pxor(mask, b.v))); -} template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { @@ -112,14 +106,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - Packet2cf res; -#ifdef EIGEN_VECTORIZE_SSE3 - res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); -#else - res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); - res.v = _mm_movelh_ps(res.v, res.v); -#endif - return res; + const float re = std::real(from); + const float im = std::imag(from); + return Packet2cf(_mm_set_ps(im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } @@ -171,74 +160,21 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - 
vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a, pconj(b)); __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) -{ - return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); -} //---------- double ---------- @@ -246,7 +182,7 @@ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; + Packet2d v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -354,66 +290,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - 
_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index db102c73a2376887332d2b25f6894bfa86284653..b485e0df12cfa164b254bd9953702edf79915908 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -444,7 +444,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packe template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -463,7 +463,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -494,7 +494,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -513,7 +513,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_pd, so we have to // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h index 4877b6d8090cdd0ae4ebf8260c3d50c1242aab70..9060b372ff57ec80bce35b9a763715a1ba79b60a 100644 --- a/Eigen/src/Core/arch/SVE/PacketMath.h +++ b/Eigen/src/Core/arch/SVE/PacketMath.h @@ -22,10 +22,6 @@ namespace internal #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 template diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index ddf5a97d86b938aff973c0e5db1a23d51c7c6206..6c67cfe058428be532b2ef2c34195235865f58eb 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,18 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet2cf half; + typedef Packet4f as_real; +}; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet1cd half; + typedef Packet2d as_real; +}; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); @@ -150,7 +160,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); return res; @@ -165,45 +175,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } @@ -228,7 +205,7 @@ 
template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; @@ -258,14 +235,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -337,39 +314,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return res; } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) @@ -456,45 +400,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 
b10c1f6c7e1977c339bcd2cae960077623635c6e..a7b59c80edfc5f0b468877139af36055c974b85d 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif @@ -94,8 +90,9 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ @@ -361,7 +358,7 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -371,7 +368,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); @@ -379,7 +376,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -389,7 +386,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -463,8 +460,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { @@ -642,7 +639,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -652,7 +649,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; pstore((float *)ai, from); 
to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -788,7 +785,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) return p; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -946,7 +943,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ -956,7 +953,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; pstore((float*)af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -981,7 +978,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index a182b4b748a815fb75ff71780ea4f66e5f80dd19..63f09ab9317e31d26525916d8ca0a031fa7c1247 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -50,7 +50,7 @@ struct scalar_sum_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough estimate! + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd // TODO vectorize mixed sum }; @@ -88,7 +88,7 @@ struct scalar_product_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! + Cost = (int(NumTraits::MulCost) + int(NumTraits::MulCost))/2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul // TODO vectorize mixed product }; @@ -364,7 +364,7 @@ struct scalar_difference_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub }; }; diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h index d2e7b5b032228c53fe6129c6f299e522f6440d33..4570c9b634bd920753aa4d9ab063c4ab49ab4ecd 100644 --- a/Eigen/src/Core/functors/StlFunctors.h +++ b/Eigen/src/Core/functors/StlFunctors.h @@ -12,6 +12,28 @@ namespace Eigen { +// Portable replacements for certain functors. 
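// (Editor's annotation: unlike std::equal_to / std::not_equal_to, the functors
// added below are EIGEN_DEVICE_FUNC-qualified, so they stay usable inside GPU
// kernels. A hypothetical use, assuming the usual Array API:
//   Eigen::ArrayXf a(3), b(3);
//   auto mask = a.binaryExpr(b, Eigen::numext::equal_to<float>());
// which yields a boolean-valued expression comparing a and b coefficient-wise.)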
+namespace numext { + +template +struct equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +template +struct not_equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +} + + namespace internal { // default functor traits for STL functors: @@ -68,10 +90,18 @@ template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + #if (EIGEN_COMP_CXXVER < 11) // std::binder* are deprecated since c++11 and will be removed in c++17 template diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index c98fa573ceeb31e547edc5574eb3753fc40900a2..16136d185aa3c16cd7b7cff1fa97ca07c405ddf0 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -109,7 +109,7 @@ struct functor_traits > template struct scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; @@ -138,7 +138,7 @@ struct functor_traits > template struct scalar_arg_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 79367f19742a4bbb61168ea2c5598cac8e05bc2b..4c649a281df12aae096f1e20404fdda36b4ec87a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -44,7 +44,7 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) -#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_SET_DEFAULT_L3_CACHE_SIZE +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE #else #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) @@ -349,36 +349,6 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - - // FIXME (a bit overkill maybe ?) 
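// (Editor's annotation: the block deleted below appears to be a workaround for
// old compilers that generated poor code for c = cj.pmadd(a, b, c); the
// selector rerouted the all-scalar case through
//   t = b; t = cj.pmul(a, t); c = padd(c, t);
// Since the gebp kernels now call cj.pmadd(A, B, C) directly, the selector,
// the gebp_madd wrapper, and the CJMADD macro are all dead code.)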
- - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; - - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; - - template - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector::run(cj,a,b,c,t); - } - - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif - template struct RhsPanelHelper { private: @@ -1673,8 +1643,8 @@ void gebp_kernel0) { Index remaining_rows = rows-i; @@ -2322,21 +2290,21 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } else if (HasHalf && psize == HalfPacketSize) { gone_half = true; PacketBlock kernel_half; - for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_half); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); } else if (HasQuarter && psize == QuarterPacketSize) { gone_quarter = true; PacketBlock kernel_quarter; - for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_quarter); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); } } count += psize*pack; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index caa65fcccf56cfbc0de60ae65c98e6975fbb89ea..73ddd260eb6f97ec570fec3cca3663e38d1ef886 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -59,9 +59,9 @@ typedef gebp_traits Traits; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, - const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) @@ -69,9 +69,9 @@ static void run(Index rows, Index cols, Index depth, typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs, lhsStride); - RhsMapper rhs(_rhs, rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_, lhsStride); + RhsMapper rhs(rhs_, rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = 
(std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 6ba0d9bdb8d1752af714b5c7e326af6bb5e9a8d8..c0b5d8050ddbf288f2dcc9142eaa4717476de2a1 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -60,9 +60,9 @@ template { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; @@ -70,9 +70,9 @@ struct general_matrix_matrix_triangular_product LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -113,7 +113,7 @@ struct general_matrix_matrix_triangular_product::ret }; - void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { typedef blas_data_mapper ResMapper; typedef blas_data_mapper BufferMapper; - ResMapper res(_res, resStride, resIncr); + ResMapper res(res_, resStride, resIncr); gebp_kernel gebp_kernel1; gebp_kernel gebp_kernel2; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index dfb6aebced46a246820c6f68efd15a400bfd30e2..974a047053f85cbc248ca0b012f20554e0a63bc5 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,6 +359,11 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::type UnsignedIndex; + const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize); + const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf); + const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter); + Index i=0; for(; i(ResScalar(0)), c7 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -393,7 +397,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c3 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -436,7 +440,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c1 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = 
rhs.template load(j,0); @@ -465,7 +469,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)); ResPacketHalf c0_h = pset1(ResScalar(0)); ResPacketQuarter c0_q = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } ResScalar cc0 = predux(c0); if (HasHalf) { - for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) { RhsPacketHalf b0 = rhs.template load(j,0); c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); @@ -496,14 +501,14 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(j,0); c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); } cc0 += predux(c0_q); } - for(; j::type>::half HalfPacket; typedef typename unpacket_traits::type>::half>::half QuarterPacket; @@ -53,7 +53,7 @@ struct symm_pack_lhs HasHalf = (int)HalfPacketSize < (int)PacketSize, HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; - const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper lhs(lhs_,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; @@ -101,11 +101,11 @@ template struct symm_pack_rhs { enum { PacketSize = packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; - const_blas_data_mapper rhs(_rhs,rhsStride); + const_blas_data_mapper rhs(rhs_,rhsStride); Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; @@ -330,8 +330,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -342,9 +342,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -355,10 +355,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - LhsTransposeMapper lhs_transpose(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + LhsTransposeMapper lhs_transpose(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -425,8 +425,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -437,9 +437,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -448,8 +448,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + ResMapper res(res_,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -466,7 +466,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix GEPP for(Index i2=0; i2& SelfAdjointView::type>::type UType; - typedef typename internal::remove_all::type>::type VType; + typedef typename internal::remove_all::type>::type UType; + typedef typename internal::remove_all::type>::type VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index f0c60507ab10100e02933e253d25a03ff99256d7..ba605a1c2ac45751748f40b9804b709c19fba3cf 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -18,10 +18,10 @@ namespace internal { // struct gemm_pack_lhs_triangular // { // Matrix::IsComplex && Conjugate> cj; -// const_blas_data_mapper lhs(_lhs,lhsStride); +// const_blas_data_mapper lhs(lhs_,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i& blocking); }; @@ -110,9 +110,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, 
const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -124,9 +124,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -254,8 +254,8 @@ struct product_triangular_matrix_matrix& blocking); }; @@ -268,9 +268,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -283,9 +283,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 76bfa159cedc5db03aafe8c798024a762b483c47..0a25748e1b13788fa9844c342716d6d886a21d1c 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -26,30 +26,30 @@ struct triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index size = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : (std::min)(_rows,_cols); - Index cols = IsLower ? (std::min)(_rows,_cols) : _cols; + Index size = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : (std::min)(rows_,cols_); + Index cols = IsLower ? 
(std::min)(rows_,cols_) : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map, 0, InnerStride<> > RhsMap; - const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); typedef Map > ResMap; - ResMap res(_res,rows); + ResMap res(res_,rows); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; @@ -84,7 +84,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index diagSize = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : diagSize; - Index cols = IsLower ? diagSize : _cols; + Index diagSize = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : diagSize; + Index cols = IsLower ? diagSize : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map > RhsMap; - const RhsMap rhs(_rhs,cols); + const RhsMap rhs(rhs_,cols); typename conj_expr_if::type cjRhs(rhs); typedef Map, 0, InnerStride<> > ResMap; - ResMap res(_res,rows,InnerStride<>(resIncr)); + ResMap res(res_,rows,InnerStride<>(resIncr)); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 3d47a2b94cbc22de5a5295c38e9ec5bdbf353930..0f8d3a1dac5a7fa406858151e759586d0224f0cb 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -50,18 +50,18 @@ struct triangular_matrix_vector_product_trmv : #define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; 
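A recurring, purely mechanical change in the hunks above and below: leading-underscore parameter names (`_rows`, `_cols`, `_lhs`, `_rhs`, `_res`) become trailing-underscore ones (`rows_`, `cols_`, `lhs_`, ...). The motivation is the C++ standard's identifier-reservation rules; as a general-language reminder (a sketch, not Eigen code):

// Reserved to the implementation in every scope:
//   identifiers containing a double underscore             (foo__bar)
//   identifiers starting with '_' plus an uppercase letter (_Rows)
// Reserved at global-namespace scope only:
//   identifiers starting with '_' plus a lowercase letter  (_rows)
// Never reserved, hence the rename target used here:
int rows_ = 0;  // a trailing underscore has no special meaning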
@@ -81,23 +81,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -121,24 +121,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size*lda; \ + y = res_; \ + a = lhs_ + size*lda; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ @@ -170,23 +170,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -210,24 +210,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size; \ + y = res_; \ + a = lhs_ + size; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index c5161022cd29626ef2e828db3c195f4e3dc07e6c..e16a564980726ca200dcaf51e15adc72f89876d5 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -39,90 +39,6 @@ template struct general_matrix_vector_product; - -template struct conj_if; - -template<> struct conj_if { - template - inline T operator()(const T& x) const { return numext::conj(x); } - template - inline T pconj(const T& x) const { return internal::pconj(x); } -}; - -template<> struct conj_if { - template - inline const T& operator()(const T& x) const { return x; } - template - inline const T& pconj(const T& x) const { return x; } -}; - -// Generic implementation for custom complex types. 
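// (Editor's annotation: the three specializations removed below are the
// conjugating variants of the complex product; writing x = rx + i*ix and
// y = ry + i*iy, the scalar identities they implement are
//   x * conj(y)       = (rx*ry + ix*iy) + i*(ix*ry - rx*iy)
//   conj(x) * y       = (rx*ry + ix*iy) + i*(rx*iy - ix*ry)
//   conj(x) * conj(y) = (rx*ry - ix*iy) - i*(rx*iy + ix*ry)
// The first identity is also what the pdiv rewrites earlier in this patch
// rely on: a / b = (a * conj(b)) / (b * conj(b)) = (a * conj(b)) / |b|^2,
// with |b|^2 obtained by squaring b element-wise and adding the swapped
// real/imaginary pair.)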
-template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType Scalar; - - EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const - { return conj_if()(x) * conj_if()(y); } -}; - -template struct conj_helper -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } -}; - -template struct conj_helper, std::complex, false,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } -}; - -template struct conj_helper, std::complex, true,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, std::complex, true,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, RealScalar, Conj,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if()(x)*y; } -}; - -template struct conj_helper, false,Conj> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x*conj_if()(y); } -}; - template struct get_factor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } }; @@ -602,7 +518,7 @@ struct blas_traits template::HasUsableDirectAccess> struct extract_data_selector { - static const typename T::Scalar* run(const T& m) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) { return blas_traits::extract(m).data(); } @@ -613,7 +529,8 @@ struct extract_data_selector { static typename T::Scalar* run(const T&) { return 0; } }; -template const typename T::Scalar* extract_data(const T& m) +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) { return extract_data_selector::run(m); } diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 
af4e69623830aece11ec266155ce0a878cdb204e..259ef0ca18ce702629474d6fc72ef11c77ffecb1 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -339,7 +339,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 + #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN #include #else #include @@ -438,13 +438,15 @@ #include #endif -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380)) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C - #if defined(EIGEN_COMP_CLANG) - // Workaround for clang: The FP16C intrinsics for clang are included by - // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + #if EIGEN_COMP_GNUC + // Make sure immintrin.h is included, even if e.g. vectorization is + // explicitly disabled (see also issue #2395). + // Note that FP16C intrinsics for gcc and clang are included by immintrin.h, + // as opposed to emmintrin.h as suggested by Intel: // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 #include #endif diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index f7f907ab742766dcc8c6715fea4a23dbfe25081a..0667b1c44e1dc25aab8b9702dc5a04dc954be87c 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -134,7 +134,7 @@ const unsigned int LinearAccessBit = 0x10; * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable. * This rules out read-only expressions. * - * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note + * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but not * the other: * \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit * \li Map-to-const expressions, for example Map, have DirectAccessBit but not LvalueBit @@ -157,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40; /** \deprecated \ingroup flags * * means the first coefficient packet is guaranteed to be aligned. - * An expression cannot has the AlignedBit without the PacketAccessBit flag. + * An expression cannot have the AlignedBit without the PacketAccessBit flag. * In other words, this means we are allow to perform an aligned packet access to the first element regardless * of the expression kind: * \code diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index fe0cfec0bc2461ac44abca8f3d05b468d3c60fd9..0865fb698d593b57ebfaa6aa838beccca5e4007a 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -1,9 +1,10 @@ #ifndef EIGEN_WARNINGS_DISABLED #define EIGEN_WARNINGS_DISABLED -#ifdef _MSC_VER +#if defined(_MSC_VER) // 4100 - unreferenced formal parameter (occurred e.g. 
in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable + // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -19,7 +20,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -35,25 +36,28 @@ #pragma warning disable 2196 279 1684 2259 #elif defined __clang__ - // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant - // this is really a stupid warning as it warns on compile-time expressions involving enums #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma clang diagnostic push #endif - #pragma clang diagnostic ignored "-Wconstant-logical-operand" - #if __clang_major__ >= 3 && __clang_minor__ >= 5 - #pragma clang diagnostic ignored "-Wabsolute-value" - #endif - #if __clang_major__ >= 10 - #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" - #endif - #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L - // warning: generic selections are a C11-specific feature - // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h - #pragma clang diagnostic ignored "-Wc11-extensions" + #if defined(__has_warning) + // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant + // this is really a stupid warning as it warns on compile-time expressions involving enums + #if __has_warning("-Wconstant-logical-operand") + #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #endif + #if __has_warning("-Wimplicit-int-float-conversion") + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #if __has_warning("-Wc11-extensions") + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif + #endif #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined(__FUJITSU) #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push @@ -74,25 +78,53 @@ #endif #if defined __NVCC__ - #pragma diag_suppress boolean_controlling_expr_is_constant + // MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so + // we instead use Microsoft's __pragma extension. 
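// (Editor's annotation: the macros defined just below stringize their argument
// because the C++11 _Pragma operator requires a string literal. Expansion
// sketch, assuming a recent nvcc:
//   EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
//     -> EIGEN_MAKE_PRAGMA(nv_diag_suppress code_is_unreachable)
//     -> _Pragma("nv_diag_suppress code_is_unreachable")
// Older nvcc front ends get diag_suppress instead, and MSVC hosts use the
// __pragma extension because the compilers required by CUDA 9.x lack _Pragma.)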
+ #if defined _MSC_VER + #define EIGEN_MAKE_PRAGMA(X) __pragma(#X) + #else + #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X) + #endif + #if defined __NVCC_DIAG_PRAGMA_SUPPORT__ + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X) + #else + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X) + #endif + + EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant) // Disable the "statement is unreachable" message - #pragma diag_suppress code_is_unreachable + EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable) // Disable the "dynamic initialization in unreachable code" message - #pragma diag_suppress initialization_not_reachable + EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable) // Disable the "invalid error number" message that we get with older versions of nvcc - #pragma diag_suppress 1222 + EIGEN_NV_DIAG_SUPPRESS(1222) // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler) - #pragma diag_suppress 2527 - #pragma diag_suppress 2529 - #pragma diag_suppress 2651 - #pragma diag_suppress 2653 - #pragma diag_suppress 2668 - #pragma diag_suppress 2669 - #pragma diag_suppress 2670 - #pragma diag_suppress 2671 - #pragma diag_suppress 2735 - #pragma diag_suppress 2737 - #pragma diag_suppress 2739 + EIGEN_NV_DIAG_SUPPRESS(2527) + EIGEN_NV_DIAG_SUPPRESS(2529) + EIGEN_NV_DIAG_SUPPRESS(2651) + EIGEN_NV_DIAG_SUPPRESS(2653) + EIGEN_NV_DIAG_SUPPRESS(2668) + EIGEN_NV_DIAG_SUPPRESS(2669) + EIGEN_NV_DIAG_SUPPRESS(2670) + EIGEN_NV_DIAG_SUPPRESS(2671) + EIGEN_NV_DIAG_SUPPRESS(2735) + EIGEN_NV_DIAG_SUPPRESS(2737) + EIGEN_NV_DIAG_SUPPRESS(2739) + EIGEN_NV_DIAG_SUPPRESS(2885) + EIGEN_NV_DIAG_SUPPRESS(2888) + EIGEN_NV_DIAG_SUPPRESS(2976) + EIGEN_NV_DIAG_SUPPRESS(2979) + EIGEN_NV_DIAG_SUPPRESS(20011) + EIGEN_NV_DIAG_SUPPRESS(20014) + // Disable the "// __device__ annotation is ignored on a function(...) that is + // explicitly defaulted on its first declaration" message. + // The __device__ annotation seems to actually be needed in some cases, + // otherwise resulting in kernel runtime errors. + EIGEN_NV_DIAG_SUPPRESS(2886) + EIGEN_NV_DIAG_SUPPRESS(2977) + EIGEN_NV_DIAG_SUPPRESS(20012) + #undef EIGEN_NV_DIAG_SUPPRESS + #undef EIGEN_MAKE_PRAGMA #endif #else diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index ef3fdfb9489f7ab2a6982c50c567429a8bcb4d26..e0092f654d9ca5138d475fc8f3a2a51c23e677b7 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -77,7 +77,7 @@ public: template FixedInt operator&( FixedInt) const { return FixedInt(); } -#if EIGEN_HAS_CXX14 +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // Needed in C++14 to allow fix(): FixedInt operator() () const { return *this; } @@ -138,7 +138,7 @@ template struct get_fixed_value,Default> { static const int value = N; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template struct get_fixed_value (*)(),Default> { static const int value = N; }; @@ -154,7 +154,7 @@ struct get_fixed_value,Default> { }; template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif @@ -166,7 +166,7 @@ template struct clea // Convert any integral type (e.g., short, int, unsigned int, etc.) 
to Eigen::Index template struct cleanup_index_type::value>::type> { typedef Index type; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; #endif @@ -184,7 +184,7 @@ template struct cleanup_index_type static const internal::FixedInt fix{}; #else diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index d2e84492834178ff4f71d49199700cc2069c9955..cd2dbe77a9f62f704dfa1513ee7391597d12dcca 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------------------ #define EIGEN_WORLD_VERSION 3 -#define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 90 +#define EIGEN_MAJOR_VERSION 4 +#define EIGEN_MINOR_VERSION 1 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -162,8 +162,8 @@ /// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ // XLC version -// 3.1 0x0301 -// 4.5 0x0405 +// 3.1 0x0301 +// 4.5 0x0405 // 5.0 0x0500 // 12.1 0x0C01 #if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) @@ -275,7 +275,7 @@ /// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE /// compliant Arm fp16 type -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16 #if defined(__ARM_FP16_FORMAT_IEEE) #define EIGEN_HAS_ARM64_FP16 1 @@ -285,28 +285,6 @@ #endif #endif -/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture -/// supports Neon vector intrinsics for fp16. -#if EIGEN_ARCH_ARM64 - #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 - #else - #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 - #endif - #endif -#endif - -/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture -/// supports Neon scalar intrinsics for fp16. -#if EIGEN_ARCH_ARM64 - #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) - #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 - #endif - #endif -#endif - /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -565,6 +543,32 @@ // #endif +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM_OR_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways even in GCC (e.g. vdiv_f16, vsqrt_f16). + #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. +#if EIGEN_ARCH_ARM_OR_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways, even in GCC (e.g. vceqh_f16). 
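// As a sketch of how these gates are consumed downstream (hypothetical user
// code, assuming the standard <arm_fp16.h> intrinsics):
//   #if EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
//     float16_t r = vaddh_f16(a, b);  // scalar fp16 add, aarch64 only
//   #endif
// A32 targets and GPU device compilation passes keep the generic scalar path.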
+ #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + #if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) // EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. // In most cases we want to check if both macros are defined which can be done using the define below. @@ -637,6 +641,14 @@ #define EIGEN_COMP_CXXVER 03 #endif +#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 + #else + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 + #endif +#endif + // The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features // but in practice we should not rely on them but rather on the availabilty of @@ -833,7 +845,7 @@ #endif #endif -// NOTE: the required Apple's clang version is very conservative +// NOTE: the required Apple's clang version is very conservative // and it could be that XCode 9 works just fine. // NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support // and not tested. @@ -962,7 +974,7 @@ #endif #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) // All functions callable from CUDA/HIP code must be qualified with __device__ -#elif defined(EIGEN_GPUCC) +#elif defined(EIGEN_GPUCC) #define EIGEN_DEVICE_FUNC __host__ __device__ #else #define EIGEN_DEVICE_FUNC @@ -989,7 +1001,7 @@ #else #define eigen_plain_assert(x) #endif -#else +#else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { namespace internal { @@ -1123,7 +1135,16 @@ namespace Eigen { #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); #elif EIGEN_ARCH_ARM_OR_ARM64 // General, NEON. - #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + // Clang doesn't like "r", + // error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type + // GCC < 5 doesn't like "g", + // error: 'asm' operand requires impossible reload + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0) + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X)); + #else + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #endif #elif EIGEN_ARCH_i386_OR_x86_64 // General, SSE. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); @@ -1177,8 +1198,12 @@ namespace Eigen { #define EIGEN_USING_STD(FUNC) using std::FUNC; #endif -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC) - // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1916 || (EIGEN_COMP_MSVC == 1916 && EIGEN_COMP_NVCC)) + // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, + // otherwise we get duplicate definition errors + // For later MSVC versions, we require explicit operator= definition, otherwise we get + // use of implicitly deleted operator errors. + // (cf Bugs 920, 1000, 1324, 2291) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -1204,7 +1229,7 @@ namespace Eigen { * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. 
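 * A minimal sketch of the case this macro exists for (hypothetical class,
 * for illustration only):
 * \code
 * struct S {
 *   S& operator=(const S&) { return *this; } // user-provided copy-assignment
 *   EIGEN_DEFAULT_COPY_CONSTRUCTOR(S)        // re-default the copy constructor
 * };
 * \endcode
 * Without the macro, S's implicitly generated copy constructor is deprecated,
 * and compilers that implement -Wdeprecated-copy warn on every copy of S.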
*/ #if EIGEN_HAS_CXX11 -#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; #else #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) #endif @@ -1229,12 +1254,12 @@ namespace Eigen { */ #if EIGEN_HAS_CXX11 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() = default; \ - ~Derived() = default; + EIGEN_DEVICE_FUNC Derived() = default; \ + EIGEN_DEVICE_FUNC ~Derived() = default; #else #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() {}; \ - /* ~Derived() {}; */ + EIGEN_DEVICE_FUNC Derived() {}; \ + /* EIGEN_DEVICE_FUNC ~Derived() {}; */ #endif diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 7cbe8a672ab310e2a996d5cb1f19f820f329889d..3aea7df51ac79952f6319a5931e8ac534b632307 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -292,20 +292,59 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) +template EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T *ptr, std::size_t size) { - std::size_t i; + std::size_t i=0; EIGEN_TRY { - for (i = 0; i < size; ++i) ::new (ptr + i) T; - return ptr; + for (i = 0; i < size; ++i) ::new (ptr + i) T; } EIGEN_CATCH(...) { destruct_elements_of_array(ptr, i); EIGEN_THROW; } - return NULL; + return ptr; +} + +/** \internal Copy-constructs the elements of an array. + * The \a size parameter tells on how many objects to copy. + */ +template EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T *ptr, const T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; +} + +/** \internal Move-constructs the elements of an array. + * The \a size parameter tells on how many objects to move. + */ +template EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T *ptr, T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { +#if EIGEN_HAS_RVALUE_REFERENCES + for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i))); +#else + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); +#endif + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; } /***************************************************************************** @@ -326,10 +365,10 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t s template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); + T *result = static_cast(aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) 
{ @@ -342,10 +381,10 @@ template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -377,21 +416,32 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned { check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(new_size < old_size) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(new_size > old_size) + + // If elements need to be explicitly initialized, we cannot simply realloc + // (or memcpy) the memory block - each element needs to be reconstructed. + // Otherwise, objects that contain internal pointers like mpfr or + // AnnoyingScalar can be pointing to the wrong thing. + T* result = static_cast(conditional_aligned_malloc(sizeof(T)*new_size)); + EIGEN_TRY { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; + // Move-construct initial elements. + std::size_t copy_size = (std::min)(old_size, new_size); + move_construct_elements_of_array(result, pts, copy_size); + + // Default-construct remaining elements. + if (new_size > old_size) { + default_construct_elements_of_array(result + copy_size, new_size - old_size); } + + // Delete old elements. + conditional_aligned_delete(pts, old_size); } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + return result; } @@ -401,12 +451,12 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned if(size==0) return 0; // short-cut. Also fixes Bug 884 check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); if(NumTraits::RequireInitialization) { EIGEN_TRY { - construct_elements_of_array(result, size); + default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -419,24 +469,13 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) { + if (NumTraits::RequireInitialization) { + return conditional_aligned_realloc_new(pts, new_size, old_size); + } + check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(NumTraits::RequireInitialization && (new_size < old_size)) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(NumTraits::RequireInitialization && (new_size > old_size)) - { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) 
- { - conditional_aligned_free(result); - EIGEN_THROW; - } - } - return result; + return static_cast(conditional_aligned_realloc(static_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); } template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) @@ -566,6 +605,17 @@ template struct smart_memmove_helper { } }; +#if EIGEN_HAS_RVALUE_REFERENCES +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::move(start, end, target); +} +#else +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::copy(start, end, target); +} +#endif /***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) *** @@ -606,7 +656,7 @@ template class aligned_stack_memory_handler : noncopyable : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) - Eigen::internal::construct_elements_of_array(m_ptr, size); + Eigen::internal::default_construct_elements_of_array(m_ptr, size); } EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() @@ -657,7 +707,7 @@ struct local_nested_eval_wrapper m_deallocate(ptr==0) { if(NumTraits::RequireInitialization && object.data()) - Eigen::internal::construct_elements_of_array(object.data(), object.size()); + Eigen::internal::default_construct_elements_of_array(object.data(), object.size()); object = xpr; } diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index f66325f898d2f8d354a07bc3ee797929aaa83b6b..b7635f985291b20c0f9df8f5545d48056c0d3caf 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -133,7 +133,10 @@ template struct remove_all { typedef typename remove_all< template struct is_arithmetic { enum { value = false }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +// GPU devices treat `long double` as `double`. +#ifndef EIGEN_GPU_COMPILE_PHASE template<> struct is_arithmetic { enum { value = true }; }; +#endif template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -189,21 +192,9 @@ template<> struct make_unsigned { typedef unsigned int type; } template<> struct make_unsigned { typedef unsigned int type; }; template<> struct make_unsigned { typedef unsigned long type; }; template<> struct make_unsigned { typedef unsigned long type; }; -#if EIGEN_COMP_MSVC -template<> struct make_unsigned { typedef unsigned __int64 type; }; -template<> struct make_unsigned { typedef unsigned __int64 type; }; -#endif - -// Some platforms define int64_t as long long even for C++03. In this case we -// are missing the definition for make_unsigned. If we just define it, we get -// duplicated definitions for platforms defining int64_t as signed long for -// C++03. We therefore add the specialization for C++03 long long for these -// platforms only. -#if EIGEN_OS_MAC template<> struct make_unsigned { typedef unsigned long long type; }; template<> struct make_unsigned { typedef unsigned long long type; }; #endif -#endif template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -466,20 +457,32 @@ template struct array_size > { }; #endif + /** \internal - * Analogue of the std::size free function. 
- * It returns the size of the container or view \a x of type \c T + * Analogue of the std::ssize free function. + * It returns the signed size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * + * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. */ -template -EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } +#if EIGEN_COMP_CXXVER < 20 +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T& x) { + return static_cast(x.size()); +} -template -EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } +#else +template +EIGEN_CONSTEXPR auto index_list_size(T&& x) { + using std::ssize; + return ssize(std::forward(x)); +} +#endif // EIGEN_COMP_CXXVER /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or @@ -696,8 +699,7 @@ struct has_binary_operator template Y))) > - // use ?: instead of || just to shut up a stupid gcc 4.3 warning + bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))> class meta_sqrt { enum { @@ -715,20 +717,25 @@ class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? /** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. + * at compile-time. */ -template +template=B)> struct meta_least_common_multiple { enum { ret = meta_least_common_multiple::ret }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; template -struct meta_least_common_multiple +struct meta_least_common_multiple { enum { ret = A*K }; }; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 2c63a95243c4ae1ab3044f5146172ee3e6ca52e4..71c32b8a116ba86bf04a00e8c260ef912154a01a 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -184,19 +184,7 @@ template struct functor_traits template struct packet_traits; -template struct unpacket_traits -{ - typedef T type; - typedef T half; - enum - { - size = 1, - alignment = 1, - vectorizable = false, - masked_load_available=false, - masked_store_available=false - }; -}; +template struct unpacket_traits; template::size)==0 || is_same::half>::value> @@ -611,9 +599,9 @@ template MatrixRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType; typedef Array ArrayRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType; typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 87d789b3f4a815c113c1cbcdaeef1ac7b8c74f29..26324cee9d91576b6ee0e9a8de5ee9fda8907cd2 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -119,8 +119,8 @@ template class GeneralizedEigenSolver : m_eivec(), m_alphas(), m_betas(), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ() {} @@ 
-134,8 +134,8 @@ template class GeneralizedEigenSolver : m_eivec(size, size), m_alphas(size), m_betas(size), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(size), m_tmp(size) {} @@ -156,8 +156,8 @@ template class GeneralizedEigenSolver : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(A.cols()), m_tmp(A.cols()) { @@ -177,7 +177,8 @@ template class GeneralizedEigenSolver * \sa eigenvalues() */ EigenvectorsType eigenvectors() const { - eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors"); + eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated"); return m_eivec; } @@ -201,7 +202,7 @@ template class GeneralizedEigenSolver */ EigenvalueType eigenvalues() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues."); return EigenvalueType(m_alphas,m_betas); } @@ -212,7 +213,7 @@ template class GeneralizedEigenSolver * \sa betas(), eigenvalues() */ ComplexVectorType alphas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas."); return m_alphas; } @@ -223,7 +224,7 @@ template class GeneralizedEigenSolver * \sa alphas(), eigenvalues() */ VectorType betas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute betas."); return m_betas; } @@ -254,7 +255,7 @@ template class GeneralizedEigenSolver ComputationInfo info() const { - eigen_assert(m_valuesOkay && "EigenSolver is not initialized."); + eigen_assert(m_isInitialized && "EigenSolver is not initialized."); return m_realQZ.info(); } @@ -277,7 +278,8 @@ template class GeneralizedEigenSolver EigenvectorsType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; - bool m_valuesOkay, m_vectorsOkay; + bool m_computeEigenvectors; + bool m_isInitialized; RealQZ m_realQZ; ComplexVectorType m_tmp; }; @@ -292,8 +294,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); Index size = A.cols(); - m_valuesOkay = false; - m_vectorsOkay = false; // Reduce to generalized real Schur form: // A = Q S Z and B = Q T Z m_realQZ.compute(A, B, computeEigenvectors); @@ -406,10 +406,9 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp i += 2; } } - - m_valuesOkay = true; - m_vectorsOkay = computeEigenvectors; } + m_computeEigenvectors = computeEigenvectors; + m_isInitialized = true; return *this; } diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h index d947dac4ebdb80a8d8184f5a30be9372736c61ae..1f21139346e7394b6c522dba3a4368125232fa99 100644 --- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -267,7 +267,7 @@ template class HessenbergDecomposition private: - typedef Matrix VectorType; + typedef Matrix VectorType; typedef typename NumTraits::Real RealScalar; static void _compute(MatrixType& matA, 
CoeffVectorType& hCoeffs, VectorType& temp); diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 59e59644eb370813b0e996ca6a6ca949897d16e5..14692365ffbf3a53eb67c0d64a961e098d14578b 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -125,6 +125,7 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), + m_hcoeffs(), m_info(InvalidInput), m_isInitialized(false), m_eigenvectorsOk(false) @@ -147,6 +148,7 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), + m_hcoeffs(size > 1 ? size - 1 : 1), m_isInitialized(false), m_eigenvectorsOk(false) {} @@ -172,6 +174,7 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), + m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1), m_isInitialized(false), m_eigenvectorsOk(false) { @@ -378,6 +381,7 @@ template class SelfAdjointEigenSolver EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; + typename TridiagonalizationType::CoeffVectorType m_hcoeffs; ComputationInfo m_info; bool m_isInitialized; bool m_eigenvectorsOk; @@ -450,7 +454,8 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(scale==RealScalar(0)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); - internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors); + m_hcoeffs.resize(n-1); + internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors); m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec); diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 6c8084f76a60283c4576165a901785ed58a911e9..eda82794ab683fdc46c164971ede750cf469fdc0 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -425,12 +425,13 @@ struct tridiagonalization_inplace_selector; * * \sa class Tridiagonalization */ -template +template EIGEN_DEVICE_FUNC -void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) +void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, + CoeffVectorType& hcoeffs, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); - tridiagonalization_inplace_selector::run(mat, diag, subdiag, extractQ); + tridiagonalization_inplace_selector::run(mat, diag, subdiag, hcoeffs, extractQ); } /** \internal @@ -439,14 +440,12 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal template struct tridiagonalization_inplace_selector { - typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; - template + template static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { - CoeffVectorType hCoeffs(mat.cols()-1); - tridiagonalization_inplace(mat,hCoeffs); + tridiagonalization_inplace(mat, hCoeffs); diag = mat.diagonal().real(); subdiag = mat.template 
diagonal<-1>().real(); if(extractQ) @@ -466,8 +465,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + template + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ) { using std::sqrt; const RealScalar tol = (std::numeric_limits::min)(); @@ -511,9 +510,9 @@ struct tridiagonalization_inplace_selector { typedef typename MatrixType::Scalar Scalar; - template + template static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 6b755008fdc5a4e68eec9abfd5752e28b8b56282..2a5c395b25d448dd7f6cdd3591e197f1eecde405 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -136,8 +136,10 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) - S(m-1) = -1; + if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) { + Index tmp = m - 1; + S(tmp) = -1; + } // Eq. (40) and (43) Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h index 9c15bfb98b4ba732893e75022f7aa5b2f0c7526a..9af6a9af720d8177759139b33f8f7db80e39f6fa 100644 --- a/Eigen/src/Geometry/arch/Geometry_SIMD.h +++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -28,8 +28,9 @@ struct quat_product evaluator ae(_a.coeffs()); evaluator be(_b.coeffs()); Quaternion res; - float arr[4] = {0.f, 0.f, 0.f, -0.f}; - const Packet4f mask = pset(arr); + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {0.f, 0.f, 0.f, neg_zero}; + const Packet4f mask = ploadu(arr); Packet4f a = ae.template packet(0); Packet4f b = be.template packet(0); Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); @@ -55,8 +56,9 @@ struct quat_conj { evaluator qe(q.coeffs()); Quaternion res; - float arr[4] = {-0.f,-0.f,-0.f,0.f}; - const Packet4f mask = pset(arr); + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f}; + const Packet4f mask = ploadu(arr); pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); return res; } @@ -146,10 +148,11 @@ struct quat_conj { evaluator qe(q.coeffs()); Quaternion res; - double arr1[2] = {-0.0, -0.0}; - double arr2[2] = {-0.0, 0.0}; - const Packet2d mask0 = pset(arr1); - const Packet2d mask2 = pset(arr2); + const double neg_zero = numext::bit_cast(0x8000000000000000ull); + const double arr1[2] = {neg_zero, neg_zero}; + const double arr2[2] = {neg_zero, 0.0}; + const Packet2d mask0 = ploadu(arr1); + const Packet2d mask2 = ploadu(arr2); pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); return res; diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 5bc037f00d18c992606fed9fef11f989fec373d5..d8984a347f6e6bb10a9d9f06b80315aac7b207e7 100644 --- a/Eigen/src/Householder/Householder.h 
+++ b/Eigen/src/Householder/Householder.h @@ -69,7 +69,7 @@ void MatrixBase::makeHouseholder( Scalar& tau, RealScalar& beta) const { - using std::sqrt; + using numext::sqrt; using numext::conj; EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart) diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 153acef65ba921c1ecec39ff670ba2c32b1dfef6..1c9ade5623f9a9ad9c3ed4065073935a81e49441 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -49,9 +49,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, x.setZero(); return true; } - Scalar rho = 1; - Scalar alpha = 1; - Scalar w = 1; + Scalar rho (1); + Scalar alpha (1); + Scalar w (1); VectorType v = VectorType::Zero(n), p = VectorType::Zero(n); VectorType y(n), z(n); diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 5d8c6b4339ee592b24f6092cfa67a53c51a2a0da..c3ca0ad54c0d0967dc5c3bd399f00a44fa9bc1b9 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -29,8 +29,6 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters, typename Dest::RealScalar& tol_error) { - using std::sqrt; - using std::abs; typedef typename Dest::RealScalar RealScalar; typedef typename Dest::Scalar Scalar; typedef Matrix VectorType; @@ -56,7 +54,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, if (residualNorm2 < threshold) { iters = 0; - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); return; } @@ -86,7 +84,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, p = z + beta * p; // update search direction i++; } - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); iters = i; } diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 7803fd8170f2f3772fa57abc22726a236026d9ba..5e632c4e2f3555d72569eba6de97d8631747199a 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -160,13 +160,13 @@ class IncompleteCholesky : public SparseSolverBase @@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. 
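// (Sketch of the hazard being avoided: in the in-place case the output can
//  alias the input, e.g. an m.computeInverseAndDetWithCheck(m, det, ok) style
//  call. The writes below are therefore ordered so that no entry of matrix is
//  overwritten before the cofactor_3x3 computations that still read it have
//  run: three cofactors are staged in locals and row 0 is written last.)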
+ typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3(matrix) * invdet; + const Scalar c11 = cofactor_3x3(matrix) * invdet; + const Scalar c02 = cofactor_3x3(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template @@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); @@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse::run(matrix_t, inverse); + } } }; @@ -347,6 +358,8 @@ inline const Inverse MatrixBase::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of the input matrix when computing the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. @@ -387,6 +400,8 @@ inline void MatrixBase::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of the input matrix when computing the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 46ffdd3202a30b795a869a1a9a396098f676b727..34aed72494d4315d7ba22950445c26fc27b2fbae 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -504,8 +504,13 @@ struct partial_lu_impl template void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix.
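// (Why the early return below is needed, as a hedged sketch: an empty
//  decomposition is legal,
//    Eigen::MatrixXd m(0, 0);
//    Eigen::PartialPivLU<Eigen::MatrixXd> lu(m);
//  and a 0x0 input has no pivots to count, while the relaxed pointer-stride
//  assertion that follows would otherwise have to read coeffRef(1) of an
//  empty transposition vector.)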
+ if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 5a8d0c1145194736849858d546090c849d695270..22ae38ac6870ae8694bcad5e4fe7a4d507b791df 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -35,6 +35,13 @@ #ifndef EIGEN_INVERSE_SIZE_4_H #define EIGEN_INVERSE_SIZE_4_H +#if EIGEN_COMP_GNUC_STRICT +// These routines require bit manipulation of the sign, which is not compatible +// with fastmath. +#pragma GCC push_options +#pragma GCC optimize ("no-fast-math") +#endif + namespace Eigen { namespace internal { @@ -54,10 +61,12 @@ struct compute_inverse_size4(0); - Packet4f _L2 = matrix.template packet(4); - Packet4f _L3 = matrix.template packet(8); - Packet4f _L4 = matrix.template packet(12); + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); // Four 2x2 sub-matrices of the input matrix // input = [[A, B], @@ -141,8 +150,8 @@ struct compute_inverse_size4(sign_mask); + EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f}; + const Packet4f p4f_sign_PNNP = pload(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); iB = pmul(iB, rd); @@ -189,25 +198,26 @@ struct compute_inverse_size4(0); - B1 = matrix.template packet(2); - A2 = matrix.template packet(4); - B2 = matrix.template packet(6); - C1 = matrix.template packet(8); - D1 = matrix.template packet(10); - C2 = matrix.template packet(12); - D2 = matrix.template packet(14); + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); } else { Packet2d temp; - A1 = matrix.template packet(0); - C1 = matrix.template packet(2); - A2 = matrix.template packet(4); - C2 = matrix.template packet(6); - + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); temp = A1; A1 = vec2d_unpacklo(A1, A2); A2 = vec2d_unpackhi(temp, A2); @@ -216,10 +226,10 @@ struct compute_inverse_size4(8); - D1 = matrix.template packet(10); - B2 = matrix.template packet(12); - D2 = matrix.template packet(14); + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); temp = B1; B1 = vec2d_unpacklo(B1, B2); @@ -323,10 +333,10 @@ struct compute_inverse_size4(sign_mask1); - const Packet2d sign_NP = pset(sign_mask2); + EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0}; + EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0}; + const Packet2d sign_PN = pload(sign_mask1); + const Packet2d sign_NP = pload(sign_mask2); d1 = pxor(rd, sign_PN); d2 = pxor(rd, sign_NP); @@ -345,4 +355,9 @@ struct compute_inverse_size4 > int m_ordering; // Ordering method to use, see SPQR's manual int m_allow_tol; // Allow to use some tolerance during numerical
factorization. RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero - mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format + mutable cholmod_sparse *m_cR = nullptr; // The sparse R factor in cholmod format mutable MatrixType m_R; // The sparse matrix R in Eigen format - mutable StorageIndex *m_E; // The permutation applied to columns - mutable cholmod_sparse *m_H; //The householder vectors - mutable StorageIndex *m_HPinv; // The row permutation of H - mutable cholmod_dense *m_HTau; // The Householder coefficients + mutable StorageIndex *m_E = nullptr; // The permutation applied to columns + mutable cholmod_sparse *m_H = nullptr; //The householder vectors + mutable StorageIndex *m_HPinv = nullptr; // The row permutation of H + mutable cholmod_dense *m_HTau = nullptr; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters bool m_useDefaultThreshold; // Use default threshold diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 17f8e44364ddeb2ddcf41ede841868f10595504c..79a6562b7cf469a028bfa37fbaf6736dd9f53bd4 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -27,6 +27,10 @@ #define eigen_internal_assert(X) assert(X); #endif +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE +#include +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -172,7 +176,7 @@ public: void setSwitchSize(int s) { - eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3"); + eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3."); m_algoswap = s; } @@ -404,7 +408,7 @@ void BDCSVD::structured_update(Block A, co //@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; // lastCol + 1 - firstCol is the size of the submatrix. //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W) -//@param firstRowW : Same as firstRowW with the column. +//@param firstColW : Same as firstRowW with the column. //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template @@ -899,7 +903,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft::computeSingVals(const ArrayRef& col0, const ArrayRef& d // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - -// if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); -// if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); + // if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); + // if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); } } @@ -1029,7 +1033,14 @@ void BDCSVD::perturbCol0 std::cout << " " << "j=" << j << "\n"; } #endif - Index j = i= k && l == 0) { + m_info = NumericalIssue; + prod = 0; + break; + } + Index j = i 0 ? 
perm(l-1) : i; #ifdef EIGEN_BDCSVD_SANITY_CHECKS if(!(dk!=Literal(0) || diag(i)!=Literal(0))) { @@ -1242,8 +1253,8 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif { // Check for total deflation - // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array() - TransposeTypeWithSameStorageOrder; + + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -202,13 +202,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) - : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) - : MatrixType::Options + Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -303,8 +302,9 @@ public: Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -680,6 +680,7 @@ JacobiSVD::compute(const MatrixType& matrix, unsig if (!(numext::isfinite)(scale)) { m_isInitialized = true; m_info = InvalidInput; + m_nonzeroSingularValues = 0; return *this; } if(scale==RealScalar(0)) scale = RealScalar(1); diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 997defc474419f5428529010054cfbed9b93bb96..a5b2f60d2122f7fedcc2bc4f1ce28a98828ec2f5 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -161,13 +161,14 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; - enum { StorageOrder = traits::Flags & RowMajorBit }; - typedef InnerStride ColInnerStride; - typedef InnerStride RowInnerStride; + static const int StorageOrder = + (traits::Flags & RowMajorBit) ? RowMajor : ColMajor; + typedef InnerStride ColInnerStride; + typedef InnerStride RowInnerStride; typedef Ref, 0, ColInnerStride> SubColumnType; typedef Ref, 0, RowInnerStride> SubRowType; typedef Ref > SubMatType; - + Index brows = A.rows(); Index bcols = A.cols(); @@ -293,7 +294,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona Index size = (std::min)(rows, cols); // X and Y are work space - enum { StorageOrder = traits::Flags & RowMajorBit }; + enum { StorageOrder = (traits::Flags & RowMajorBit) ? 
RowMajor : ColMajor }; Matrix CholMatrixType tmp(size,size); ConstCholMatrixPtr pmat; - if(m_P.size()==0 && (UpLo&Upper)==Upper) + if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper) { // If there is no ordering, try to directly use the input matrix without any copy internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 5b4f6cc9f3f81e4857ba64021c4c34a8e0f23af9..c16caec704e66e0530f48a1d209f2009f28f979f 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -429,12 +429,7 @@ struct unary_evaluator, IteratorBa enum { IsRowMajor = XprType::IsRowMajor, - - OuterVector = (BlockCols==1 && ArgType::IsRowMajor) - | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". - // revert to || as soon as not needed anymore. - (BlockRows==1 && !ArgType::IsRowMajor), - + OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor), CoeffReadCost = evaluator::CoeffReadCost, Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 6130bab430be7dbd3f6953b0006248a80a52399d..9b0d3f98dcd7c9ad9d446d65bc3de39e7b03c90a 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -126,7 +126,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -211,7 +211,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -298,7 +298,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -457,7 +457,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -530,7 +530,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -604,7 +604,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index df6c28d2b89be903c1e2b98e5a38d6b8fd6ff06b..32dac0f7863eb8b572c1fd4bf21752755a034aaf 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -24,7 +24,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -79,7 +79,7 @@ 
struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index f99be3379db2ac32d9cc4dd18f86c3ecf138998a..6247d79bd9a525d9e26f80eac20ecafcafaa227e 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -237,6 +237,7 @@ class Map /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * The inner indices must be sorted appropriately. * * This constructor is available only if \c SparseMatrixType is non-const. * diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 616b4a0c24e7b7b2e725bd4a3e7b62388a00e2a4..9fc06b5e712f440cb5746f53e79b7601602c2733 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -793,6 +793,7 @@ class SparseMatrix template EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other); +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m) { EIGEN_DBG_SPARSE( @@ -837,6 +838,7 @@ class SparseMatrix s << static_cast&>(m); return s; } +#endif /** Destructor */ inline ~SparseMatrix() diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 229449f02274463f3242ea2283faf3a93ab006cb..417a2365e04173301ae64fb257db2637c00d6431 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -113,7 +113,7 @@ template class SparseMatrixBase Transpose >::type AdjointReturnType; typedef Transpose TransposeReturnType; - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order typedef SparseMatrix PlainObject; @@ -214,7 +214,7 @@ template class SparseMatrixBase inline void assignGeneric(const OtherDerived& other); public: - +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m) { typedef typename Derived::Nested Nested; @@ -263,6 +263,7 @@ template class SparseMatrixBase } return s; } +#endif template Derived& operator+=(const SparseMatrixBase& other); diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 88820a48f364a8a7663ad37d0a9505550cdcba44..25ce404b8976d3e2ed83fbb8b499b13c456c682a 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -90,9 +90,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res.swap(res_); } }; @@ -104,9 +104,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; - SparseTemporaryType _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, 
_res, tolerance); - res = _res; + SparseTemporaryType res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res = res_; } }; @@ -117,9 +117,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, res_, tolerance); + res.swap(res_); } }; @@ -137,9 +137,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; -// SparseTemporaryType _res(res.cols(), res.rows()); -// sparse_sparse_product_with_pruning_impl(rhs, lhs, _res); -// res = _res.transpose(); +// SparseTemporaryType res_(res.cols(), res.rows()); +// sparse_sparse_product_with_pruning_impl(rhs, lhs, res_); +// res = res_.transpose(); } }; diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 05779be685b8f39d70f761e8064c7f0c2665d7b6..106925be4e51959432990a7d6696a97f4f39763f 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -329,6 +329,7 @@ class SparseVector } #endif +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseVector& m) { for (Index i=0; i } - Index count = 0; +// Index count = 0; // FIXME compute a reference value to filter zeros for (typename AmbiVector::Iterator it(tempVector/*,1e-12*/); it; ++it) { - ++ count; +// ++ count; // std::cerr << "fill " << it.index() << ", " << col << "\n"; // std::cout << it.value() << " "; // FIXME use insertBack diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 0c8d8939be2e21089f6883341b4339d5f4868536..6eb79502fdc38772b190dd5e527ec539dc813222 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -35,9 +35,10 @@ public: MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - SparseLUTransposeView() : m_sparseLU(NULL) {} - SparseLUTransposeView(const SparseLUTransposeView& view) { + SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() { this->m_sparseLU = view.m_sparseLU; + this->m_isInitialized = view.m_isInitialized; } void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;} @@ -752,10 +753,13 @@ void SparseLU::factorize(const MatrixType& matrix) info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu); if ( info ) { - m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT "; + m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR"; +#ifndef EIGEN_NO_IO std::ostringstream returnInfo; - returnInfo << info; + returnInfo << " ... ZERO COLUMN AT "; + returnInfo << info; m_lastError += returnInfo.str(); +#endif m_info = NumericalIssue; m_factorizationIsOk = false; return; @@ -830,7 +834,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator template void solveInPlace(MatrixBase &X) const { Index nrhs = X.cols(); - Index n = X.rows(); // Backward solve with U for (Index k = m_mapL.nsuper(); k >= 0; k--) { @@ -850,7 +853,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { // FIXME: the following lines should use Block expressions and not Map! 
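// (The replacement pattern applied here, sketched: rather than re-viewing X's
//  storage through a raw Map with an explicit outer stride,
//    Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> >
//        U(&X(fsupc,0), nsupc, nrhs, OuterStride<>(n));
//  the patch takes a block expression,
//    typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
//  which involves no pointer arithmetic and stays correct for any storage
//  layout of the destination.)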
Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); U = A.template triangularView().solve(U); } @@ -873,7 +876,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { using numext::conj; Index nrhs = X.cols(); - Index n = X.rows(); // Forward solve with U for (Index k = 0; k <= m_mapL.nsuper(); k++) { @@ -904,7 +906,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator else { Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = A.adjoint().template triangularView().solve(U); else diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h index cf5ec449bec3474860b49c3348a90c15cb15fed2..16a0c41f4654c2cf912b7d75346bd7c318b455c9 100644 --- a/Eigen/src/SparseLU/SparseLU_Structs.h +++ b/Eigen/src/SparseLU/SparseLU_Structs.h @@ -70,8 +70,8 @@ #define EIGEN_LU_STRUCTS namespace Eigen { namespace internal { - -typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; + +enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL}; template struct LU_GlobalLU_t { diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 0be293d17fa51110a211fb99b22ad3e8c67c4560..fd5e9fa51ce01fe5652295471196b4a1daeb831f 100644 --- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -274,9 +274,8 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co // Triangular solve Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); - U = A.template triangularView().solve(U); - + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); + U = A.template triangularView().solve(U); // Matrix-vector product new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); work.topRows(nrow).noalias() = A * U; @@ -349,7 +348,7 @@ void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = U - A.adjoint() * work.topRows(nrow); else diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h deleted file mode 100644 index e37c2fe0d028482549db8186a6650745ce0f6db7..0000000000000000000000000000000000000000 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ /dev/null @@ -1,280 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2012 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
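The substitutions above replace raw Map-with-OuterStride views of X by middleRows() block expressions, which are valid for any destination expression. A small standalone sketch of that API (values illustrative, not from the patch):

\code
// Illustrative sketch of middleRows(), the block expression the patch
// substitutes for Map with an OuterStride: it selects rows [i, i+n) and
// writes through to the underlying matrix.
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd X = Eigen::MatrixXd::Ones(8, 3);
  X.middleRows(2, 4) *= 5.0;  // scales rows 2..5 of X in place
  return 0;
}
\endcode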
- -#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H -#define EIGEN_SPARSELU_GEMM_KERNEL_H - -namespace Eigen { - -namespace internal { - - -/** \internal - * A general matrix-matrix product kernel optimized for the SparseLU factorization. - * - A, B, and C must be column major - * - lda and ldc must be multiples of the respective packet size - * - C must have the same alignment as A - */ -template -EIGEN_DONT_INLINE -void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc) -{ - using namespace Eigen::internal; - - typedef typename packet_traits::type Packet; - enum { - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - PacketSize = packet_traits::size, - PM = 8, // peeling in M - RN = 2, // register blocking - RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking - BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk - SM = PM*PacketSize // step along M - }; - Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking - Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once - Index i0 = internal::first_default_aligned(A,m); - - eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m))); - - // handle the non aligned rows of A and C without any optimization: - for(Index i=0; i(BM, m-ib); // actual number of rows - Index actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling - Index actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization - - // Let's process two columns of B-C at once - for(Index j=0; j(Bc0[0]); } - { b10 = pset1(Bc0[1]); } - if(RK==4) { b20 = pset1(Bc0[2]); } - if(RK==4) { b30 = pset1(Bc0[3]); } - { b01 = pset1(Bc1[0]); } - { b11 = pset1(Bc1[1]); } - if(RK==4) { b21 = pset1(Bc1[2]); } - if(RK==4) { b31 = pset1(Bc1[3]); } - - Packet a0, a1, a2, a3, c0, c1, t0, t1; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = C+ib+(j+0)*ldc; - Scalar* C1 = C+ib+(j+1)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);} -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - c1 = pload(C1+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - KMADD(c1, a0, b01, t1) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - KMADD(c1, a1, b11, t1) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ KMADD(c1, a2, b21, t1) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ KMADD(c1, a3, b31, t1) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); \ - pstore(C1+i+(I)*PacketSize, c1) - - // process rows of A' - C' with aggressive vectorization and peeling - for(Index i=0; i0) - { - const Scalar* Bc0 = B+(n-1)*ldb; - - for(Index k=0; k(Bc0[0]); - b10 = pset1(Bc0[1]); - if(RK==4) b20 = pset1(Bc0[2]); - if(RK==4) b30 = pset1(Bc0[3]); - - Packet a0, a1, a2, a3, c0, t0/*, t1*/; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = 
C+ib+(n_end)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); - - // aggressive vectorization and peeling - for(Index i=0; i0) - { - for(Index j=0; j1 ? Aligned : 0 - }; - typedef Map, Alignment > MapVector; - typedef Map, Alignment > ConstMapVector; - if(rd==1) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b); - - else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b); - - else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b) - + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b); - } - } - - } // blocking on the rows of A and C -} -#undef KMADD - -} // namespace internal - -} // namespace Eigen - -#endif // EIGEN_SPARSELU_GEMM_KERNEL_H diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h index 6f75d500e5f831f414175ce46dbceffa0acd5539..7aecbcad8ed2703000d62cfd5d88d983c69a7423 100644 --- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h @@ -75,8 +75,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe // Identify the relaxed supernodes by postorder traversal of the etree Index snode_start; // beginning of a snode StorageIndex k; - Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree - Index nsuper_et = 0; // Number of relaxed snodes in the original etree StorageIndex l; for (j = 0; j < n; ) { @@ -88,7 +86,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe parent = et(j); } // Found a supernode in postordered etree, j is the last column - ++nsuper_et_post; k = StorageIndex(n); for (Index i = snode_start; i <= j; ++i) k = (std::min)(k, inv_post(i)); @@ -97,7 +94,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe { // This is also a supernode in the original etree relax_end(k) = l; // Record last column - ++nsuper_et; } else { @@ -107,7 +103,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe if (descendants(i) == 0) { relax_end(l) = l; - ++nsuper_et; } } } diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index 8c1b3e8bc67c89ea80b81f22691695cf6cebf90c..7a101ea0c862bf389c0aaf0958379eb598a7e92e 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -69,8 +69,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod::run(const Index seg Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize; Map, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); - l.setZero(); - internal::sparselu_gemm(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), 
l.outerStride()); + l.noalias() = B * u; // Scatter tempv[] into SPA dense[] as a temporary storage isub = lptr + no_zeros; diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index f052001c8f8880c17909959687bd2be4a97ae02a..92cdb0e45c457fcbb44ec0efe32888b4cef51f5e 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -148,8 +148,7 @@ void SparseLUImpl::panel_bmod(const Index m, const Index w, Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize; MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); - L.setZero(); - internal::sparselu_gemm(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride()); + L.noalias() = B * U; // scatter U and L u_col = 0; diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 0e5d5445b18fcda01a43e33e958a9a964b8d1354..1b422e2015aab9ced8e029d789615fd1922512bc 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -30,15 +30,53 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa max() */ -EIGEN_MAKE_CWISE_BINARY_OP(min,min) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const OtherDerived &other) const +{ + return (min)(other); +} /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other * * \sa max() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, - const CwiseNullaryOp, PlainObject> > +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const Scalar &other) const +{ + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN min #else @@ -46,7 +84,7 @@ min #endif (const Scalar &other) const { - return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of \c *this and \a other @@ -56,14 +94,52 @@ min * * \sa min() */ -EIGEN_MAKE_CWISE_BINARY_OP(max,max) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const OtherDerived &other) const +{ + return (max)(other); +} /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other * * \sa min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const 
CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const Scalar &other) const +{ + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN max @@ -72,7 +148,7 @@ max #endif (const Scalar &other) const { - return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index b7ea22a9d8aae2f51682c5272432e660787f772a..13c55f4b115880c0becb6246bdc1fb86c9f404fa 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -497,6 +497,45 @@ ceil() const return CeilReturnType(derived()); } +template struct ShiftRightXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type arithmetically + * shifted right by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftLeft() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftRightXpr::Type +shiftRight() const +{ + return typename ShiftRightXpr::Type(derived()); +} + + +template struct ShiftLeftXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type logically + * shifted left by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftRight() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftLeftXpr::Type +shiftLeft() const +{ + return typename ShiftLeftXpr::Type(derived()); +} + /** \returns an expression of the coefficient-wise isnan of *this. * * Example: \include Cwise_isNaN.cpp diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h index 42ff901ca5252b1068be6a2e111b336bda6f9b7f..5418dc4154f2b078e964383d6bb5acebe0bf3210 100644 --- a/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -64,49 +64,6 @@ cast() const return typename CastXpr::Type(derived()); } -template struct ShiftRightXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type arithmetically -/// shifted right by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftRightXpr::Type -shift_right() const -{ - return typename ShiftRightXpr::Type(derived()); -} - - -template struct ShiftLeftXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type logically -/// shifted left by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftLeftXpr::Type -shift_left() const -{ - return typename ShiftLeftXpr::Type(derived()); -} - /// \returns an expression of the complex conjugate of \c *this. 
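The shiftRight()/shiftLeft() documentation above describes coefficient-wise bit shifts with a compile-time shift count. A short usage sketch (the operator names are as documented in the patch; the values are illustrative):

\code
// Usage sketch for the coefficient-wise shift operators documented above;
// N is a compile-time template parameter giving the number of bit positions.
#include <Eigen/Dense>

int main() {
  Eigen::ArrayXi a(3);
  a << 4, 8, 12;
  Eigen::ArrayXi r = a.shiftRight<2>();  // arithmetic shift: 1, 2, 3
  Eigen::ArrayXi l = a.shiftLeft<1>();   // logical shift:    8, 16, 24
  return 0;
}
\endcode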
/// EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 5bfb19ac6cdfd5a66fdf15f001d72f7efc2a8300..15c35b0bf886fa964c38e86ce9386adefc01788a 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -90,8 +90,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); + internal::index_list_size(actualRowIndices), + internal::index_list_size(actualColIndices)); } // The following overload returns a Scalar @@ -168,7 +168,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); + (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); } template diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h index f1084abefbcdcb8368683fe01a4f01c71cc85621..514d83a71de96e3855b6a060a0300e3f1f54d29c 100644 --- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise != operator of *this and \a other @@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and \a other @@ -72,23 +72,39 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa class CwiseBinaryOp, max() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return cwiseMin(other); } /** \returns an expression of the coefficient-wise min of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> 
+cwiseMin(const Scalar &other) const +{ + return cwiseMin(Derived::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMin(const Scalar &other) const { - return cwiseMin(Derived::Constant(rows(), cols(), other)); + return cwiseMin(Derived::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of *this and \a other @@ -98,23 +114,39 @@ cwiseMin(const Scalar &other) const * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return cwiseMax(other); } /** \returns an expression of the coefficient-wise max of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMax(const Scalar &other) const +{ + return cwiseMax(Derived::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMax(const Scalar &other) const { - return cwiseMax(Derived::Constant(rows(), cols(), other)); + return cwiseMax(Derived::Constant(rows(), cols(), other)); } diff --git a/bench/basicbenchmark.h b/bench/basicbenchmark.h index 3fdc35732f02010997e1bcfd745854a60cdf69be..8059375b5e1fde71a28583b2c62f667f5cb84154 100644 --- a/bench/basicbenchmark.h +++ b/bench/basicbenchmark.h @@ -16,13 +16,13 @@ void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations) { asm("#begin_bench_loop LazyEval"); if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize"); - m = (I + 0.00005 * (m + m.lazy() * m)).eval(); + m = (I + 0.00005 * (m + m.lazyProduct(m))).eval(); } else if (Mode==OmpEval) { asm("#begin_bench_loop OmpEval"); if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize"); - m = (I + 0.00005 * (m + m.lazy() * m)).evalOMP(); + m = (I + 0.00005 * (m + m.lazyProduct(m))).eval(); } else { diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh index 16658c4baa1768e6de86513dde0badaf102e3b9c..5b391c6ef59d0781cf418d9f49e763d6ba69b0b6 100644 --- a/bench/btl/libs/STL/STL_interface.hh +++ b/bench/btl/libs/STL/STL_interface.hh @@ -84,9 +84,12 @@ public : for (int j=0;j=j) + { for (int k=0;k suite(device, N); \ cudaDeviceSynchronize(); \ @@ -40,7 +40,7 @@ BM_FuncGPU(fullReduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ @@ -59,7 +59,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, 
DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index 76d68c5c1b80cc7fa1b333417587bbf94a76b335..c778102645192b475fda75c81a39bac88617c49e 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -10,7 +10,7 @@ #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ @@ -40,7 +40,7 @@ BM_FuncGPU(fullReduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ @@ -59,7 +59,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 545bc989c65196c22a3847698436d6deaf5e121d..c530957fbdecda8b21ae67ebc1b50e873fffc846 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -1,6 +1,7 @@ project(EigenBlas CXX) +if(EIGEN_BUILD_BLAS) include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) @@ -26,24 +27,31 @@ else() set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) endif() +set(EIGEN_BLAS_TARGETS "") + add_library(eigen_blas_static ${EigenBlas_SRCS}) -add_library(eigen_blas SHARED ${EigenBlas_SRCS}) +list(APPEND EIGEN_BLAS_TARGETS eigen_blas_static) -if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(eigen_blas_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - target_link_libraries(eigen_blas ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +if (EIGEN_BUILD_SHARED_LIBS) + add_library(eigen_blas SHARED ${EigenBlas_SRCS}) + list(APPEND EIGEN_BLAS_TARGETS eigen_blas) endif() -add_dependencies(blas eigen_blas eigen_blas_static) +foreach(target IN LISTS EIGEN_BLAS_TARGETS) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() -install(TARGETS eigen_blas eigen_blas_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) + add_dependencies(blas ${target}) + install(TARGETS ${target} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endforeach() if(EIGEN_Fortran_COMPILER_WORKS) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(testing) # can't do EXCLUDE_FROM_ALL here, breaks CTest else() @@ -52,4 +60,4 @@ if(BUILD_TESTING) endif() endif() - +endif() diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 6dd6338b43236b463ebc8cf18c6904795957a8ad..66216c96451159fb959e590b631d6701a1e45743 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -362,18 +362,18 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const 
int *n, const typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking&); static const functype func[8] = { // array index: NOTR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0, // array index: NOTR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0 }; #endif diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index 9cb3bb20bbcc56fa206e0714fd832d66227d7a18..add6aab53e08ff717bf46fcaa8fabb4b45baa83a 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -11,6 +11,15 @@ add_custom_target(buildtests) add_custom_target(check COMMAND "ctest") add_dependencies(check buildtests) +# Convenience target for only building GPU tests. +add_custom_target(buildtests_gpu) +add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" + "--no-compress-output" + "--build-no-clean" + "-T" "test" + "-L" "gpu") +add_dependencies(check_gpu buildtests_gpu) + # check whether /bin/bash exists (disabled as not used anymore) # find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 0808446d6e11b5f45429e25719efacbea9de29d1..995354f059269619063cec58b408f0427feca66c 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -23,7 +23,9 @@ macro(ei_add_test_internal testname testname_with_suffix) set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n") set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + set(is_gpu_test OFF) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) + set(is_gpu_test ON) if(EIGEN_TEST_HIP) hip_reset_flags() hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") @@ -57,10 +59,10 @@ macro(ei_add_test_internal testname testname_with_suffix) add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") - add_dependencies(eigen2_buildtests ${targetname}) - else() - add_dependencies(buildtests ${targetname}) + add_dependencies(buildtests ${targetname}) + + if (is_gpu_test) + add_dependencies(buildtests_gpu ${targetname}) endif() if(EIGEN_NO_ASSERTION_CHECKING) @@ -83,7 +85,7 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) - ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + ei_add_target_property(${targetname} COMPILE_FLAGS ${EIGEN_TEST_CUSTOM_CXX_FLAGS}) endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -118,6 +120,11 @@ macro(ei_add_test_internal testname testname_with_suffix) add_dependencies("Build${current_subproject}" ${targetname}) set_property(TEST 
${testname_with_suffix} PROPERTY LABELS "${current_subproject}") endif() + if (is_gpu_test) + # Add gpu tag for testing only GPU tests. + set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu") + endif() + if(EIGEN_SYCL) # Force include of the SYCL file at the end to avoid errors. set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) @@ -478,6 +485,7 @@ macro(ei_get_compilerver VAR) execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION} OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "^[ \n\r]+" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) string(REGEX REPLACE "[\n\r].*" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER) @@ -487,9 +495,10 @@ macro(ei_get_compilerver VAR) endmacro() # Extract compiler name and version from a raw version string -# WARNING: if you edit thid macro, then please test it by uncommenting +# WARNING: if you edit this macro, then please test it by uncommenting # the testing macro call in ei_init_testing() of the EigenTesting.cmake file. -# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end of the file +# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end +# of the file macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) # extract possible compiler names string(REGEX MATCH "g\\+\\+" ei_has_gpp ${VERSTRING}) @@ -497,6 +506,7 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) string(REGEX MATCH "gcc|GCC" ei_has_gcc ${VERSTRING}) string(REGEX MATCH "icpc|ICC" ei_has_icpc ${VERSTRING}) string(REGEX MATCH "clang|CLANG" ei_has_clang ${VERSTRING}) + string(REGEX MATCH "mingw32" ei_has_mingw ${VERSTRING}) # combine them if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc)) @@ -505,6 +515,8 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) set(${CNAME} "llvm-clang++") elseif(ei_has_clang) set(${CNAME} "clang++") + elseif ((ei_has_mingw) AND (ei_has_gpp OR ei_has_gcc)) + set(${CNAME} "mingw32-g++") elseif(ei_has_icpc) set(${CNAME} "icpc") elseif(ei_has_gpp OR ei_has_gcc) @@ -525,11 +537,17 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) if(NOT eicver) # try to extract 2: string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+" eicver ${VERSTRING}) - else() - set(eicver " _") + if (NOT eicver AND ei_has_mingw) + # try to extract 1 number plus suffix: + string(REGEX MATCH "[^0-9][0-9]+-win32" eicver ${VERSTRING}) + endif() endif() endif() endif() + + if (NOT eicver) + set(eicver " _") + endif() string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver}) @@ -654,6 +672,7 @@ macro(ei_test_get_compilerver_from_cxx_version_string) ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4") + ei_test1_get_compilerver_from_cxx_version_string("x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110" "mingw32-g++" "10-win32") endmacro() # Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets @@ -763,8 +782,7 @@ macro(ei_add_smoke_tests smoke_test_list) if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST) add_dependencies("${buildtarget}" "${test}") # Add label smoketest to be able to run smoketests using ctest - get_property(test_labels TEST ${test} PROPERTY LABELS) - set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest") + set_property(TEST ${test} APPEND PROPERTY LABELS "smoketest") endif() endforeach() -endmacro(ei_add_smoke_tests) \ No newline at end of file +endmacro(ei_add_smoke_tests) diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake index 7d1f81b032ea479b697bbd918e483086d2a4ebbf..1bb8f19652c9582dca81662e1bad7fb23a7aea1b 100644 --- a/cmake/FindBLAS.cmake +++ b/cmake/FindBLAS.cmake @@ -147,6 +147,7 @@ mark_as_advanced(BLAS_VERBOSE) include(CheckFunctionExists) include(CheckFortranFunctionExists) +include(CMakeFindDependencyMacro) set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) @@ -509,9 +510,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) - find_package(Threads) + find_dependency(Threads) else() - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) endif() set(BLAS_SEARCH_LIBS "") diff --git a/cmake/FindBLASEXT.cmake b/cmake/FindBLASEXT.cmake index 0fe7fb84931bacf4f7879be412c6abf37c44d175..69a941897ef23c7c77d72fa41fc969e3c320b0df 100644 --- a/cmake/FindBLASEXT.cmake +++ b/cmake/FindBLASEXT.cmake @@ -41,18 +41,19 @@ # License text for the above reference.) 
# macro to factorize this call +include(CMakeFindDependencyMacro) macro(find_package_blas) if(BLASEXT_FIND_REQUIRED) if(BLASEXT_FIND_QUIETLY) - find_package(BLAS REQUIRED QUIET) + find_dependency(BLAS REQUIRED QUIET) else() - find_package(BLAS REQUIRED) + find_dependency(BLAS REQUIRED) endif() else() if(BLASEXT_FIND_QUIETLY) - find_package(BLAS QUIET) + find_dependency(BLAS QUIET) else() - find_package(BLAS) + find_dependency(BLAS) endif() endif() endmacro() @@ -316,7 +317,7 @@ if(BLA_VENDOR MATCHES "Intel*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS BLAS_INCLUDE_DIRS) @@ -324,14 +325,14 @@ if(BLA_VENDOR MATCHES "Intel*") if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() else() if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS BLAS_INCLUDE_DIRS) @@ -343,14 +344,14 @@ elseif(BLA_VENDOR MATCHES "ACML*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) if(BLAS_PAR_LIBRARIES) if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() elseif(BLA_VENDOR MATCHES "IBMESSL*") @@ -360,21 +361,24 @@ elseif(BLA_VENDOR MATCHES "IBMESSL*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) if(BLAS_PAR_LIBRARIES) if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() else() if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) endif() + +# Callers expect BLAS_FOUND to be set as well. 
+set(BLAS_FOUND BLASEXT_FOUND) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 3cca5150e8fb88927940ad6a77bbb81163917f46..1c271f0fecba7fcd510176040525088330c341f9 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -41,7 +41,8 @@ set(COMPUTECPP_BITCODE "spir64" CACHE STRING "Bitcode type to use as SYCL target in compute++") mark_as_advanced(COMPUTECPP_BITCODE) -find_package(OpenCL REQUIRED) +include(CMakeFindDependencyMacro) +find_dependency(OpenCL REQUIRED) # Find ComputeCpp package diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake index fad476d0da01ed2a4b2cb63c0481dcdca2b7768a..ed55c5fad6baa31628a2327541a03c1701055566 100644 --- a/cmake/FindFFTW.cmake +++ b/cmake/FindFFTW.cmake @@ -22,7 +22,8 @@ if( NOT FFTW_ROOT AND ENV{FFTWDIR} ) endif() # Check if we can use PkgConfig -find_package(PkgConfig) +include(CMakeFindDependencyMacro) +find_dependency(PkgConfig) #Determine from PKG if( PKG_CONFIG_FOUND AND NOT FFTW_ROOT ) diff --git a/cmake/FindHWLOC.cmake b/cmake/FindHWLOC.cmake index 48329151892af498dc13871758069b4b1047b2fb..522f5215795ab937f1a815d5b9c1cf046138a265 100644 --- a/cmake/FindHWLOC.cmake +++ b/cmake/FindHWLOC.cmake @@ -65,8 +65,9 @@ endif() # Optionally use pkg-config to detect include/library dirs (if pkg-config is available) # ------------------------------------------------------------------------------------- -include(FindPkgConfig) -find_package(PkgConfig QUIET) +include(CMakeFindDependencyMacro) +# include(FindPkgConfig) +find_dependency(PkgConfig QUIET) if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) pkg_search_module(HWLOC hwloc) diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake index 284a4529c7a4df4d255657e242a07f443802359c..3fd738807090183de5f57ac2466582cc7126c4a2 100644 --- a/cmake/FindLAPACK.cmake +++ b/cmake/FindLAPACK.cmake @@ -26,6 +26,7 @@ include(CheckFunctionExists) +include(CMakeFindDependencyMacro) # This macro checks for the existence of the combination of fortran libraries # given by _list. If the combination is found, this macro checks (using the @@ -88,7 +89,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b set(${LIBRARIES} ${_libraries_found}) # Some C++ linkers require the f2c library to link with Fortran libraries. # I do not know which ones, thus I just add the f2c library if it is available. - find_package( F2C QUIET ) + find_dependency( F2C QUIET ) if ( F2C_FOUND ) set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) @@ -135,9 +136,9 @@ endmacro() # LAPACK requires BLAS if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - find_package(BLAS) + find_dependency(BLAS) else() - find_package(BLAS REQUIRED) + find_dependency(BLAS REQUIRED) endif() if (NOT BLAS_FOUND) diff --git a/cmake/FindMPREAL.cmake b/cmake/FindMPREAL.cmake new file mode 100644 index 0000000000000000000000000000000000000000..947a1ce88678ddb4ffd7f49d38ac30e731afb332 --- /dev/null +++ b/cmake/FindMPREAL.cmake @@ -0,0 +1,103 @@ +# Try to find the MPFR C++ (MPREAL) library +# See http://www.holoborodko.com/pavel/mpreal/ +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(MPREAL 1.8.6) +# to require version 1.8.6 or newer of MPREAL C++. 
+# +# Once done this will define +# +# MPREAL_FOUND - system has MPREAL lib with correct version +# MPREAL_INCLUDES - MPREAL required include directories +# MPREAL_LIBRARIES - MPREAL required libraries +# MPREAL_VERSION - MPREAL version + +# Copyright (c) 2020 The Eigen Authors. +# Redistribution and use is allowed according to the terms of the BSD license. + +include(CMakeFindDependencyMacro) +find_dependency(MPFR) +find_dependency(GMP) + +# Set MPREAL_INCLUDES +find_path(MPREAL_INCLUDES + NAMES + mpreal.h + PATHS + $ENV{GMPDIR} + ${INCLUDE_INSTALL_DIR} +) + +# Set MPREAL_FIND_VERSION to 1.0.0 if no minimum version is specified + +if(NOT MPREAL_FIND_VERSION) + if(NOT MPREAL_FIND_VERSION_MAJOR) + set(MPREAL_FIND_VERSION_MAJOR 1) + endif() + if(NOT MPREAL_FIND_VERSION_MINOR) + set(MPREAL_FIND_VERSION_MINOR 0) + endif() + if(NOT MPREAL_FIND_VERSION_PATCH) + set(MPREAL_FIND_VERSION_PATCH 0) + endif() + + set(MPREAL_FIND_VERSION "${MPREAL_FIND_VERSION_MAJOR}.${MPREAL_FIND_VERSION_MINOR}.${MPREAL_FIND_VERSION_PATCH}") +endif() + +# Check bugs +# - https://github.com/advanpix/mpreal/issues/7 +# - https://github.com/advanpix/mpreal/issues/9 +set(MPREAL_TEST_PROGRAM " +#include +#include +int main(int argc, char** argv) { + const mpfr::mpreal one = 1.0; + const mpfr::mpreal zero = 0.0; + using namespace std; + const mpfr::mpreal smaller = min(one, zero); + return 0; +}") + +if(MPREAL_INCLUDES) + + # Set MPREAL_VERSION + + file(READ "${MPREAL_INCLUDES}/mpreal.h" _mpreal_version_header) + + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MAJOR[ \t]+([0-9]+)" _mpreal_major_version_match "${_mpreal_version_header}") + set(MPREAL_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MINOR[ \t]+([0-9]+)" _mpreal_minor_version_match "${_mpreal_version_header}") + set(MPREAL_MINOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpreal_patchlevel_version_match "${_mpreal_version_header}") + set(MPREAL_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}") + + set(MPREAL_VERSION ${MPREAL_MAJOR_VERSION}.${MPREAL_MINOR_VERSION}.${MPREAL_PATCHLEVEL_VERSION}) + + # Check whether found version exceeds minimum version + + if(${MPREAL_VERSION} VERSION_LESS ${MPREAL_FIND_VERSION}) + set(MPREAL_VERSION_OK FALSE) + message(STATUS "MPREAL version ${MPREAL_VERSION} found in ${MPREAL_INCLUDES}, " + "but at least version ${MPREAL_FIND_VERSION} is required") + else() + set(MPREAL_VERSION_OK TRUE) + + list(APPEND MPREAL_INCLUDES "${MPFR_INCLUDES}" "${GMP_INCLUDES}") + list(REMOVE_DUPLICATES MPREAL_INCLUDES) + + list(APPEND MPREAL_LIBRARIES "${MPFR_LIBRARIES}" "${GMP_LIBRARIES}") + list(REMOVE_DUPLICATES MPREAL_LIBRARIES) + + # Make sure it compiles with the current compiler. 
+ unset(MPREAL_WORKS CACHE) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES "${MPREAL_INCLUDES}") + set(CMAKE_REQUIRED_LIBRARIES "${MPREAL_LIBRARIES}") + check_cxx_source_compiles("${MPREAL_TEST_PROGRAM}" MPREAL_WORKS) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MPREAL DEFAULT_MSG + MPREAL_INCLUDES MPREAL_VERSION_OK MPREAL_WORKS) +mark_as_advanced(MPREAL_INCLUDES) diff --git a/cmake/FindPastix.cmake b/cmake/FindPASTIX.cmake similarity index 96% rename from cmake/FindPastix.cmake rename to cmake/FindPASTIX.cmake index 3b47d5ce33bce84374148c4090e3fb14cd666cbd..db1427b0a4b2ee16452aa4110c2a2bae36e62ab0 100644 --- a/cmake/FindPastix.cmake +++ b/cmake/FindPASTIX.cmake @@ -118,7 +118,7 @@ if( PASTIX_FIND_COMPONENTS ) if (${component} STREQUAL "SCOTCH") set(PASTIX_LOOK_FOR_SCOTCH ON) endif() - if (${component} STREQUAL "SCOTCH") + if (${component} STREQUAL "PTSCOTCH") set(PASTIX_LOOK_FOR_PTSCOTCH ON) endif() if (${component} STREQUAL "METIS") @@ -133,14 +133,14 @@ endif() # Required dependencies # --------------------- - +include(CMakeFindDependencyMacro) if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect pthread") endif() if (PASTIX_FIND_REQUIRED) - find_package(Threads REQUIRED QUIET) + find_dependency(Threads REQUIRED QUIET) else() - find_package(Threads QUIET) + find_dependency(Threads QUIET) endif() set(PASTIX_EXTRA_LIBRARIES "") if( THREADS_FOUND ) @@ -198,9 +198,9 @@ if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect HWLOC") endif() if (PASTIX_FIND_REQUIRED) - find_package(HWLOC REQUIRED QUIET) + find_dependency(HWLOC REQUIRED QUIET) else() - find_package(HWLOC QUIET) + find_dependency(HWLOC QUIET) endif() # PASTIX depends on BLAS @@ -209,9 +209,9 @@ if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect BLAS") endif() if (PASTIX_FIND_REQUIRED) - find_package(BLASEXT REQUIRED QUIET) + find_dependency(BLASEXT REQUIRED QUIET) else() - find_package(BLASEXT QUIET) + find_dependency(BLASEXT QUIET) endif() # Optional dependencies @@ -230,9 +230,9 @@ if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) set(MPI_C_COMPILER mpicc) endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI) - find_package(MPI REQUIRED QUIET) + find_dependency(MPI REQUIRED QUIET) else() - find_package(MPI QUIET) + find_dependency(MPI QUIET) endif() if (MPI_FOUND) mark_as_advanced(MPI_LIBRARY) @@ -272,10 +272,10 @@ if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) endif() # set the list of optional dependencies we may discover if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU) - find_package(STARPU ${PASTIX_STARPU_VERSION} REQUIRED + find_dependency(STARPU ${PASTIX_STARPU_VERSION} REQUIRED COMPONENTS ${STARPU_COMPONENT_LIST}) else() - find_package(STARPU ${PASTIX_STARPU_VERSION} + find_dependency(STARPU ${PASTIX_STARPU_VERSION} COMPONENTS ${STARPU_COMPONENT_LIST}) endif() @@ -288,9 +288,9 @@ if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH) message(STATUS "Looking for PASTIX - Try to detect SCOTCH") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH) - find_package(SCOTCH REQUIRED QUIET) + find_dependency(SCOTCH REQUIRED QUIET) else() - find_package(SCOTCH QUIET) + find_dependency(SCOTCH QUIET) endif() endif() @@ -301,9 +301,9 @@ if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH) message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH) - find_package(PTSCOTCH REQUIRED QUIET) + 
find_dependency(PTSCOTCH REQUIRED QUIET) else() - find_package(PTSCOTCH QUIET) + find_dependency(PTSCOTCH QUIET) endif() endif() @@ -314,9 +314,9 @@ if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS) message(STATUS "Looking for PASTIX - Try to detect METIS") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS) - find_package(METIS REQUIRED QUIET) + find_dependency(METIS REQUIRED QUIET) else() - find_package(METIS QUIET) + find_dependency(METIS QUIET) endif() endif() diff --git a/cmake/FindPTSCOTCH.cmake b/cmake/FindPTSCOTCH.cmake index 51eecf1af866e16c50eb7866d3e1ff469dcc940d..6ccc743e68cc5711fce90bf74ccb611cebd4759d 100644 --- a/cmake/FindPTSCOTCH.cmake +++ b/cmake/FindPTSCOTCH.cmake @@ -79,20 +79,21 @@ if( PTSCOTCH_FIND_COMPONENTS ) endif() # PTSCOTCH depends on Threads, try to find it +include(CMakeFindDependencyMacro) if (NOT THREADS_FOUND) if (PTSCOTCH_FIND_REQUIRED) - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) else() - find_package(Threads) + find_dependency(Threads) endif() endif() # PTSCOTCH depends on MPI, try to find it if (NOT MPI_FOUND) if (PTSCOTCH_FIND_REQUIRED) - find_package(MPI REQUIRED) + find_dependency(MPI REQUIRED) else() - find_package(MPI) + find_dependency(MPI) endif() endif() @@ -148,18 +149,18 @@ else() foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") find_path(PTSCOTCH_${ptscotch_hdr}_DIRS - NAMES ${ptscotch_hdr} - HINTS ${PTSCOTCH_DIR} - PATH_SUFFIXES "include" "include/scotch") + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) endforeach() else() foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") find_path(PTSCOTCH_${ptscotch_hdr}_DIRS - NAMES ${ptscotch_hdr} - HINTS ${_inc_env} - PATH_SUFFIXES "scotch") + NAMES ${ptscotch_hdr} + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) endforeach() endif() @@ -171,7 +172,6 @@ foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) if (PTSCOTCH_${ptscotch_hdr}_DIRS) list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}") else () - set(PTSCOTCH_INCLUDE_DIRS "PTSCOTCH_INCLUDE_DIRS-NOTFOUND") if (NOT PTSCOTCH_FIND_QUIETLY) message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found") endif() @@ -229,16 +229,16 @@ else() foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY - NAMES ${ptscotch_lib} - HINTS ${PTSCOTCH_DIR} - PATH_SUFFIXES lib lib32 lib64) + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) endforeach() else() foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY - NAMES ${ptscotch_lib} - HINTS ${_lib_env}) + NAMES ${ptscotch_lib} + HINTS ${_lib_env}) endforeach() endif() endif() @@ -255,7 +255,6 @@ foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}") else () - list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") if (NOT PTSCOTCH_FIND_QUIETLY) message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found") endif() diff --git a/cmake/FindScotch.cmake b/cmake/FindSCOTCH.cmake 
similarity index 99% rename from cmake/FindScotch.cmake rename to cmake/FindSCOTCH.cmake index af00eb0f2565868e5470739cf662ce664347a2aa..11b971a926443aa415fb28759f6aa2f04f703e41 100644 --- a/cmake/FindScotch.cmake +++ b/cmake/FindSCOTCH.cmake @@ -71,11 +71,12 @@ if( SCOTCH_FIND_COMPONENTS ) endif() # SCOTCH may depend on Threads, try to find it +include(CMakeFindDependencyMacro) if (NOT THREADS_FOUND) if (SCOTCH_FIND_REQUIRED) - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) else() - find_package(Threads) + find_dependency(Threads) endif() endif() diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake index 41bc2fa894b29b55de903af46ddf640333a9793b..81042390729788ca8a644e52ba901af74d355120 100644 --- a/cmake/FindTriSYCL.cmake +++ b/cmake/FindTriSYCL.cmake @@ -57,18 +57,19 @@ mark_as_advanced(TRISYCL_DEBUG_STRUCTORS) mark_as_advanced(TRISYCL_TRACE_KERNEL) #triSYCL definitions -set(CL_SYCL_LANGUAGE_VERSION 220 CACHE VERSION +set(CL_SYCL_LANGUAGE_VERSION 220 CACHE STRING "Host language version to be used by trisYCL (default is: 220)") -set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE VERSION +set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE STRING "Device language version to be used by trisYCL (default is: 220)") -#set(TRISYCL_COMPILE_OPTIONS "-std=c++1z -Wall -Wextra") -set(CMAKE_CXX_STANDARD 14) +# triSYCL now requires c++17 +set(CMAKE_CXX_STANDARD 17) set(CXX_STANDARD_REQUIRED ON) # Find OpenCL package +include(CMakeFindDependencyMacro) if(TRISYCL_OPENCL) - find_package(OpenCL REQUIRED) + find_dependency(OpenCL REQUIRED) if(UNIX) set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH "Path to Boost.Compute headers (default is: /usr/include/compute)") @@ -77,11 +78,11 @@ endif() # Find OpenMP package if(TRISYCL_OPENMP) - find_package(OpenMP REQUIRED) + find_dependency(OpenMP REQUIRED) endif() # Find Boost -find_package(Boost 1.58 REQUIRED COMPONENTS chrono log) +find_dependency(Boost 1.58 REQUIRED COMPONENTS chrono log) # If debug or trace we need boost log if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL) @@ -90,9 +91,23 @@ else() set(LOG_NEEDED OFF) endif() -find_package(Threads REQUIRED) +find_dependency(Threads REQUIRED) # Find triSYCL directory +if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES) + set(TRISYCL_FIND_QUIETLY TRUE) +endif () + +find_path(TRISYCL_INCLUDE_DIR + NAMES sycl.hpp + PATHS $ENV{TRISYCLDIR} $ENV{TRISYCLDIR}/include ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES triSYCL +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(TriSYCL DEFAULT_MSG + TRISYCL_INCLUDE_DIR) + if(NOT TRISYCL_INCLUDE_DIR) message(FATAL_ERROR "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR") @@ -100,36 +115,42 @@ else() message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}") endif() +include(CMakeParseArguments) ####################### # add_sycl_to_target ####################### -# -# Sets the proper flags and includes for the target compilation. -# -# targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. -# binaryDir : Intermediate directory to output the integration header. 
-# -function(add_sycl_to_target targetName sourceFile binaryDir) +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(ADD_SYCL_ARGS + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) # Add include directories to the "#include <>" paths - target_include_directories (${targetName} PUBLIC + target_include_directories (${ADD_SYCL_ARGS_TARGET} PUBLIC ${TRISYCL_INCLUDE_DIR} ${Boost_INCLUDE_DIRS} $<$:${OpenCL_INCLUDE_DIRS}> $<$:${BOOST_COMPUTE_INCPATH}>) - # Link dependencies - target_link_libraries(${targetName} PUBLIC + target_link_libraries(${ADD_SYCL_ARGS_TARGET} $<$:${OpenCL_LIBRARIES}> Threads::Threads $<$:Boost::log> Boost::chrono) - # Compile definitions - target_compile_definitions(${targetName} PUBLIC + target_compile_definitions(${ADD_SYCL_ARGS_TARGET} PUBLIC + EIGEN_SYCL_TRISYCL $<$:TRISYCL_NO_ASYNC> $<$:TRISYCL_OPENCL> $<$:TRISYCL_DEBUG> @@ -138,13 +159,13 @@ function(add_sycl_to_target targetName sourceFile binaryDir) $<$:BOOST_LOG_DYN_LINK>) # C++ and OpenMP requirements - target_compile_options(${targetName} PUBLIC + target_compile_options(${ADD_SYCL_ARGS_TARGET} PUBLIC ${TRISYCL_COMPILE_OPTIONS} $<$:${OpenMP_CXX_FLAGS}>) if(${TRISYCL_OPENMP} AND (NOT WIN32)) # Does not support generator expressions - set_target_properties(${targetName} + set_target_properties(${ADD_SYCL_ARGS_TARGET} PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) endif() diff --git a/doc/CustomizingEigen_Plugins.dox b/doc/CustomizingEigen_Plugins.dox index d88f2409b955f8b03884459366276bc92b42f388..9ab0200ff946dc1be9e0d6465ddff920a83a2314 100644 --- a/doc/CustomizingEigen_Plugins.dox +++ b/doc/CustomizingEigen_Plugins.dox @@ -59,7 +59,7 @@ operator+(const Scalar& scalar, const MatrixBase& mat) { return CwiseBinaryOp, const ConstantReturnType, Derived>(Constant(rows(),cols(),scalar), mat.derived()); } \endcode -Then one can the following declaration in the config.h or whatever prerequisites header file of his project: +Then one can add the following declaration in the config.h or whatever prerequisites header file of his project: \code #define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h" \endcode diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index 6b4e49214c295cb67eb3cf3a01767661684a0765..3e745462cd42747c6f1a47898d7d32b52f1c9e56 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -126,7 +126,7 @@ and contrary to what one might think at first, this implementation is fine unles MatrixXf x,y,z; MatrixXf C = cov(x,y+z); \endcode -In this special case, the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression x+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C. +In this special case, the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression y+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C. \b Note: Functions taking \e const references to Matrix (or Array) can process expressions at the cost of temporaries. 
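The FunctionsTakingEigenTypes.dox fix above clarifies that the hidden temporary holds y+z, not x+z. A self-contained sketch of that call pattern; this cov() is a simplified stand-in for the doc's example, not the actual implementation:

\code
// Simplified stand-in for the doc's cov() example: because the parameters
// are const MatrixXf&, the expression y + z is evaluated into an unnamed
// temporary MatrixXf before cov() runs, exactly as the corrected text says.
#include <Eigen/Dense>
using Eigen::MatrixXf;

MatrixXf cov(const MatrixXf& x, const MatrixXf& y) {
  MatrixXf x_c = x.rowwise() - x.colwise().mean();  // center columns
  MatrixXf y_c = y.rowwise() - y.colwise().mean();
  return (x_c.adjoint() * y_c) / float(x.rows() - 1);
}

int main() {
  MatrixXf x = MatrixXf::Random(100, 3), y = MatrixXf::Random(100, 3),
           z = MatrixXf::Random(100, 3);
  MatrixXf C = cov(x, y + z);  // y + z lives in a hidden temporary
  return 0;
}
\endcode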
diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox index 24dfe4b4f47b7f0cef94401820f0598f5016d119..ddbf38dec9b27f7fd5bac0d74efc5af9dfe77ca8 100644 --- a/doc/LeastSquares.dox +++ b/doc/LeastSquares.dox @@ -30,14 +30,17 @@ computing least squares solutions: This is an example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink. +If you just need to solve the least squares problem, but are not interested in the SVD per se, a +faster alternative method is CompleteOrthogonalDecomposition. \section LeastSquaresQR Using the QR decomposition The solve() method in QR decomposition classes also computes the least squares solution. There are -three QR decomposition classes: HouseholderQR (no pivoting, so fast but unstable), -ColPivHouseholderQR (column pivoting, thus a bit slower but more accurate) and FullPivHouseholderQR -(full pivoting, so slowest and most stable). Here is an example with column pivoting: +three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is +not full rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and +FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR). +Here is an example with column pivoting: @@ -61,9 +64,11 @@ Finding the least squares solution of \a Ax = \a b is equivalent to solving the
Example: | Output:
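As a sketch of the normal-equations approach this section describes (the 4x2 random system is only an assumed example, in the spirit of the snippet referenced above):
\code
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(4, 2);
  Eigen::VectorXf b = Eigen::VectorXf::Random(4);
  // Solve the normal equation A^T A x = A^T b via a Cholesky (LDLT) factorization.
  Eigen::VectorXf x = (A.transpose() * A).ldlt().solve(A.transpose() * b);
  std::cout << "The least-squares solution is:\n" << x << std::endl;
}
\endcode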
-If the matrix \a A is ill-conditioned, then this is not a good method, because the condition number +This method is usually the fastest, especially when \a A is "tall and skinny". However, if the +matrix \a A is even mildly ill-conditioned, this is not a good method, because the condition number of A^T A is the square of the condition number of \a A. This means that you -lose twice as many digits using normal equation than if you use the other methods. +lose roughly twice as many digits of accuracy using the normal equation, compared to the more stable +methods mentioned above. */ diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 38754e4afbee9248f94a7cc28cafc9660337f52f..66d3bcd3c6f41c7e50d9febae17f925e014960a9 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -13,24 +13,20 @@ They are summarized in the following tables:
Class | Solver kind | Matrix kind | Features related to performance | License | Notes
SimplicialLLT \n \#include<Eigen/SparseCholesky> | Direct LLt factorization | SPD | Fill-in reducing | LGPL | SimplicialLDLT is often preferable
SimplicialLDLT \n \#include<Eigen/SparseCholesky> | Direct LDLt factorization | SPD | Fill-in reducing | LGPL | Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)
SparseLU \n \#include<Eigen/SparseLU> | LU factorization | Square | Fill-in reducing, leverages fast dense algebra | MPL2 | Optimized for small and large problems with irregular patterns
SparseQR \n \#include<Eigen/SparseQR> | QR factorization | Any, rectangular | Fill-in reducing | MPL2 | Recommended for least-squares problems, has a basic rank-revealing feature
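For reference, a minimal sketch of driving one of the direct solvers listed above; the small SPD tridiagonal (1D Poisson) matrix is an assumed example, not part of this patch:
\code
#include <Eigen/Sparse>
#include <Eigen/Dense>
#include <iostream>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 5;
  SpMat A(n, n); // SPD tridiagonal (1D Poisson) matrix
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0) A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);
  Eigen::SimplicialLDLT<SpMat> solver(A);         // factorize
  if (solver.info() != Eigen::Success) return 1;  // factorization failed
  Eigen::VectorXd x = solver.solve(b);
  std::cout << x << std::endl;
}
\endcode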
@@ -38,21 +34,18 @@ They are summarized in the following tables:
Class | Solver kind | Matrix kind | Supported preconditioners, [default] | License | Notes
ConjugateGradient \n \#include<Eigen/IterativeLinearSolvers> | Classic iterative CG | SPD | IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholesky | MPL2 | Recommended for large symmetric problems (e.g., 3D Poisson eq.)
LeastSquaresConjugateGradient \n \#include<Eigen/IterativeLinearSolvers> | CG for rectangular least-squares problems | Rectangular | IdentityPreconditioner, [LeastSquareDiagonalPreconditioner] | MPL2 | Solve for min |A'Ax-b|^2 without forming A'A
BiCGSTAB \n \#include<Eigen/IterativeLinearSolvers> | Iterative stabilized bi-conjugate gradient | Square | IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUT | MPL2 | To speed up the convergence, try it with the \ref IncompleteLUT preconditioner.
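And a matching sketch for the iterative solvers above, again on an assumed small Poisson matrix; the default DiagonalPreconditioner from the table is used implicitly:
\code
#include <Eigen/Sparse>
#include <iostream>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 5;
  SpMat A(n, n);
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0) A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);
  // Use both triangular halves of the symmetric matrix.
  Eigen::ConjugateGradient<SpMat, Eigen::Lower | Eigen::Upper> cg(A);
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations()
            << ", estimated error: " << cg.error() << std::endl;
}
\endcode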
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index 9779f3f9c88bd52e39f45f37a54f582de2ebe2a7..14a5891312853dbcc85e64f7d32cca4f98dc4fdb 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -153,7 +153,7 @@ It is easy to perform arithmetic operations on sparse matrices provided that the \code perm.indices(); // Reference to the vector of indices sm1.twistedBy(perm); // Permute rows and columns -sm2 = sm1 * perm; // Permute the columns +sm2 = sm1 * perm; // Permute the rows sm2 = perm * sm1; // Permute the columns \endcode diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 0965da87248933a49f6474aa09c6b03553d21fe1..402b3769e2f3e44e6c145e9d0bdeefe82d9a6462 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -72,7 +72,7 @@ To get an overview of the true relative speed of the different decompositions, c Orthogonalization Yes Excellent - Soon: blocking + - @@ -88,6 +88,18 @@ To get an overview of the true relative speed of the different decompositions, c + CompleteOrthogonalDecomposition + - + Fast + Good + Yes + Orthogonalization + Yes + Excellent + - + + + LLT Positive definite Very fast @@ -99,7 +111,7 @@ To get an overview of the true relative speed of the different decompositions, c Blocking - + LDLT Positive or negative semidefinite1 Very fast diff --git a/doc/TutorialBlockOperations.dox b/doc/TutorialBlockOperations.dox index a2d8c97cc094f62b0bd64278658272792362295a..df277482c897e23675520e21e0fb07b50ff0cce5 100644 --- a/doc/TutorialBlockOperations.dox +++ b/doc/TutorialBlockOperations.dox @@ -167,6 +167,20 @@ matrix.rightCols(q);\endcode \code matrix.rightCols();\endcode +%Block containing the q columns starting from i + \link DenseBase::middleCols() * \endlink + \code +matrix.middleCols(i,q);\endcode + \code +matrix.middleCols(i);\endcode + +%Block containing the q rows starting from i + \link DenseBase::middleRows() * \endlink + \code +matrix.middleRows(i,q);\endcode + \code +matrix.middleRows(i);\endcode + Here is a simple example illustrating the use of the operations presented above: diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox index a72724143c693fd918268be84e722e093c2c65bf..8042fcad333788acc479a97d1b39809d3bcefd0a 100644 --- a/doc/TutorialLinearAlgebra.dox +++ b/doc/TutorialLinearAlgebra.dox @@ -14,7 +14,7 @@ QR, %SVD, eigendecompositions... After reading this page, don't miss our \f[ Ax \: = \: b \f] Where \a A and \a b are matrices (\a b could be a vector, as a special case). You want to find a solution \a x. -\b The \b solution: You can choose between various decompositions, depending on what your matrix \a A looks like, +\b The \b solution: You can choose between various decompositions, depending on the properties of your matrix \a A, and depending on whether you favor speed or accuracy. However, let's start with an example that works in all cases, and is a good compromise: @@ -34,7 +34,7 @@ Vector3f x = dec.solve(b); Here, ColPivHouseholderQR is a QR decomposition with column pivoting. It's a good compromise for this tutorial, as it works for all matrices while being quite fast. Here is a table of some other decompositions that you can choose from, -depending on your matrix and the trade-off you want to make: +depending on your matrix, the problem you are trying to solve, and the trade-off you want to make:
@@ -128,11 +128,13 @@ depending on your matrix and the trade-off you want to make:
To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink. -All of these decompositions offer a solve() method that works as in the above example. +All of these decompositions offer a solve() method that works as in the above example. -For example, if your matrix is positive definite, the above table says that a very good -choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general -matrix (not a vector) as right hand side is possible. +If you know more about the properties of your matrix, you can use the above table to select the best method. +For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU. +If you know that your matrix is also symmetric and positive definite, the above table says that +a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general +matrix (not a vector) as right hand side is possible: @@ -146,7 +148,34 @@ For a \ref TopicLinearAlgebraDecompositions "much more complete table" comparing supports many other decompositions), see our special page on \ref TopicLinearAlgebraDecompositions "this topic". -\section TutorialLinAlgSolutionExists Checking if a solution really exists + +\section TutorialLinAlgLeastsquares Least squares solving + +The most general and accurate method to solve under- or over-determined linear systems +in the least squares sense is the SVD decomposition. Eigen provides two implementations. +The recommended one is the BDCSVD class, which scales well for large problems +and automatically falls back to the JacobiSVD class for smaller problems. +For both classes, their solve() method solves the linear system in the least-squares +sense. + +Here is an example: +
Example: | Output:
\include TutorialLinAlgSVDSolve.cpp | \verbinclude TutorialLinAlgSVDSolve.out
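The included snippet is along these lines; a minimal sketch of least-squares solving with BDCSVD (the sizes here are an assumption):
\code
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(4, 2);
  Eigen::VectorXf b = Eigen::VectorXf::Random(4);
  // solve() returns the least-squares solution of Ax = b.
  Eigen::VectorXf x = A.bdcSvd(Eigen::ComputeThinU | Eigen::ComputeThinV).solve(b);
  std::cout << "The least-squares solution is:\n" << x << std::endl;
}
\endcode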
+ +An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition. + +Again, if you know more about the problem, the table above contains methods that are potentially faster. +If your matrix is full rank, HouseholderQR is the method of choice. If your matrix is full rank and well conditioned, +using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still. +Our page on \link LeastSquares least squares solving \endlink has more details. + + +\section TutorialLinAlgSolutionExists Checking if a matrix is singular Only you know what error margin you want to allow for a solution to be considered valid. So Eigen lets you do this computation for yourself, if you want to, as in this example: @@ -179,11 +208,11 @@ very rare. The call to info() is to check for this possibility. \section TutorialLinAlgInverse Computing inverse and determinant First of all, make sure that you really want this. While inverse and determinant are fundamental mathematical concepts, -in \em numerical linear algebra they are not as popular as in pure mathematics. Inverse computations are often +in \em numerical linear algebra they are not as useful as in pure mathematics. Inverse computations are often advantageously replaced by solve() operations, and the determinant is often \em not a good way of checking if a matrix is invertible. -However, for \em very \em small matrices, the above is not true, and inverse and determinant can be very useful. +However, for \em very \em small matrices, the above may not be true, and inverse and determinant can be very useful. While certain decompositions, such as PartialPivLU and FullPivLU, offer inverse() and determinant() methods, you can also call inverse() and determinant() directly on a matrix. If your matrix is of a very small fixed size (at most 4x4) this @@ -198,28 +227,6 @@ Here is an example: -\section TutorialLinAlgLeastsquares Least squares solving - -The most accurate method to do least squares solving is with a SVD decomposition. -Eigen provides two implementations. -The recommended one is the BDCSVD class, which scale well for large problems -and automatically fall-back to the JacobiSVD class for smaller problems. -For both classes, their solve() method is doing least-squares solving. - -Here is an example: - - - - - - -
Example: | Output:
\include TutorialLinAlgSVDSolve.cpp | \verbinclude TutorialLinAlgSVDSolve.out
+ -Another methods, potentially faster but less reliable, are to use a Cholesky decomposition of the -normal matrix or a QR decomposition. Our page on \link LeastSquares least squares solving \endlink -has more details. - - \section TutorialLinAlgSeparateComputation Separating the computation from the construction In the above examples, the decomposition was computed at the same time that the decomposition object was constructed. diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox index 2c452220f433757f0921668064c2f49b0da167ae..9b6b4b1f0ad2a1901005096da3fa44522cc1ac7e 100644 --- a/doc/TutorialMatrixClass.dox +++ b/doc/TutorialMatrixClass.dox @@ -111,9 +111,9 @@ Vector4d c(5.0, 6.0, 7.0, 8.0); If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients: \code -Vector2i a(1, 2); // A column vector containing the elements {1, 2} -Matrix<int, 5, 1> b {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} -Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} +Vector2i a(1, 2); // A column-vector containing the elements {1, 2} +Matrix<int, 5, 1> b {1, 2, 3, 4, 5}; // A column-vector containing the elements {1, 2, 3, 4, 5} +Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} \endcode In the general case of matrices and vectors with either fixed or runtime sizes, diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index 98ace43e4a5a2cf9e793b0711d3196f1f621638c..60f1edca35269c90065367be384aa7ebb03bc0ad 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox @@ -14,8 +14,8 @@ In particular, it supports \b slicing that consists in taking a set of rows, col All the aforementioned operations are handled through the generic DenseBase::operator()(const RowIndices&, const ColIndices&) method. Each argument can be: - An integer indexing a single row or column, including symbolic indices. - - The symbol Eigen::all representing the whole set of respective rows or columns in increasing order. - - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::lastN functions. + - The symbol Eigen::placeholders::all representing the whole set of respective rows or columns in increasing order. + - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::placeholders::lastN functions. - Any 1D vector/array of integers including %Eigen's vector/array, expressions, std::vector, std::array, as well as plain C arrays: `int[N]`. More generally, it can accept any object exposing the following two member functions: @@ -72,7 +72,7 @@ Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v.
%Block starting at \c i,j having \c m rows, and \c n columns - \code A(seqN(i,m), seqN(i,n) \endcode + \code A(seqN(i,m), seqN(j,n)) \endcode \code A.block(i,j,m,n) \endcode @@ -129,12 +129,12 @@ Here comes \link Eigen::lastN(SizeType) Eigen::lastN(size) \endlink, and \link E Bottom-right corner of A of size \c m times \c n - \code v(lastN(m), lastN(n)) \endcode + \code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode Bottom-right corner of A of size \c m times \c n - \code v(lastN(m), lastN(n)) \endcode + \code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index 350ea11396dab3a36eae853b916893bdcdaf52f0..4faba418d095a5edeb0c33383fc1d350b8b2d5c5 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -44,8 +44,8 @@ This storage scheme is better explained on an example. The following matrix and one of its possible sparse, \b column \b major representation:
-Values: 22 7 _ 3 5 14 _ _ 1 _ 17 8
-InnerIndices: 1 2 _ 0 2 4 _ _ 2 _ 1 4
+Values: 22 7 _ 3 5 _ 14 _ 1 _ 17 8
+InnerIndices: 1 2 _ 0 2 _ 4 _ 2 _ 1 4
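A small illustrative sketch, not part of the patch, of inspecting these storage arrays on a compressed column-major SparseMatrix:
\code
#include <Eigen/Sparse>
#include <iostream>

int main() {
  Eigen::SparseMatrix<double> m(5, 5); // column-major by default
  m.insert(0, 1) = 22.0;
  m.insert(2, 0) = 7.0;
  m.insert(4, 2) = 5.0;
  m.makeCompressed(); // switch to the compressed (CCS) representation
  // Values and inner (row) indices, one entry per stored coefficient:
  for (Eigen::Index k = 0; k < m.nonZeros(); ++k)
    std::cout << m.valuePtr()[k] << " @ row " << m.innerIndexPtr()[k] << "\n";
  // Outer starts: one offset per column, plus the terminating offset.
  for (Eigen::Index j = 0; j <= m.outerSize(); ++j)
    std::cout << m.outerIndexPtr()[j] << ' ';
}
\endcode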
@@ -60,7 +60,7 @@ On the other hand, inserting elements with increasing inner indices in a given i The case where no empty space is available is a special case, and is referred as the \em compressed mode. It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS). Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function. -In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. +In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. Therefore, in practice a call to SparseMatrix::makeCompressed() frees this buffer. It is worth noting that most of our wrappers to external libraries requires compressed matrices as inputs. diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp index 74699381c7e834159317c19002165d3ff7f357b2..cc0eead13d4b87cf1bf562679e37e4905cff58bf 100644 --- a/doc/examples/matrixfree_cg.cpp +++ b/doc/examples/matrixfree_cg.cpp @@ -9,7 +9,7 @@ using Eigen::SparseMatrix; namespace Eigen { namespace internal { - // MatrixReplacement looks-like a SparseMatrix, so let's inherits its traits: + // MatrixReplacement looks-like a SparseMatrix, so let's inherit its traits: template<> struct traits : public Eigen::internal::traits > {}; @@ -66,7 +66,7 @@ namespace internal { { // This method should implement "dst += alpha * lhs * rhs" inplace, // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it. - assert(alpha==Scalar(1) && "scaling is not implemented"); + eigen_assert(alpha==Scalar(1) && "scaling is not implemented"); EIGEN_ONLY_USED_FOR_DEBUG(alpha); // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs, diff --git a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp index bb1c2ccf14b48b8fce1c584abf71281c1107104d..adeed9af64e43535ae708f7b1c7fb3dbd8655008 100644 --- a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp +++ b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp @@ -1,4 +1,4 @@ MatrixXcf ones = MatrixXcf::Ones(3,3); ComplexEigenSolver ces(ones); cout << "The first eigenvector of the 3x3 matrix of ones is:" - << endl << ces.eigenvectors().col(1) << endl; + << endl << ces.eigenvectors().col(0) << endl; diff --git a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp index cfc8b0d54b73ea7984211b7e7238ddc575a4e840..94b0d6ebd3c08f1414ae1ae7b5adf87d0da7d0d4 100644 --- a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp +++ b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp @@ -1,4 +1,4 @@ MatrixXd ones = MatrixXd::Ones(3,3); SelfAdjointEigenSolver es(ones); cout << "The first eigenvector of the 3x3 matrix of ones is:" - << endl << es.eigenvectors().col(1) << endl; + << endl << es.eigenvectors().col(0) << endl; diff --git a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp index 93dcfca1d6de1bf0e06419c7575a94d7c329fbfa..9a66baa76d1552435a970c512b500b68ed75a1e4 100644 --- a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp +++ b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp @@ -4,7 +4,9 @@ cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl; VectorXd diag(5); VectorXd subdiag(4); 
-internal::tridiagonalization_inplace(A, diag, subdiag, true); +VectorXd hcoeffs(4); // Scratch space for householder reflector. +VectorXd workspace(5); +internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, workspace, true); cout << "The orthogonal matrix Q is:" << endl << A << endl; cout << "The diagonal of the tridiagonal matrix T is:" << endl << diag << endl; cout << "The subdiagonal of the tridiagonal matrix T is:" << endl << subdiag << endl; diff --git a/doc/snippets/compile_snippet.cpp.in b/doc/snippets/compile_snippet.cpp.in index c11457a3f576df2555fa5d45fbae5a08c29962a5..04f276d0bd0c639475faabe8efe6d87381f1ce16 100644 --- a/doc/snippets/compile_snippet.cpp.in +++ b/doc/snippets/compile_snippet.cpp.in @@ -2,6 +2,7 @@ static bool eigen_did_assert = false; #define eigen_assert(X) if(!eigen_did_assert && !(X)){ std::cout << "### Assertion raised in " << __FILE__ << ":" << __LINE__ << ":\n" #X << "\n### The following would happen without assertions:\n"; eigen_did_assert = true;} #include +#include #include #ifndef M_PI diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 9eec810761e3b85605648c26725f290d8e75539c..8d6d75401690da7e1baacad04ed69bef541ffa44 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,10 +1,18 @@ - project(EigenLapack CXX) +if(EIGEN_BUILD_LAPACK AND EIGEN_BUILD_BLAS) + include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) enable_language(Fortran) + if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") + if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER_EQUAL 10.0) + # We use an old version of LAPACK with argument type mismatches. + # Allow them to compile anyway with newer GNU versions. + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fallow-argument-mismatch") + endif() + endif() set(EIGEN_Fortran_COMPILER_WORKS ON) else() set(EIGEN_Fortran_COMPILER_WORKS OFF) @@ -88,25 +96,29 @@ endif() endif() -add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS}) -add_library(eigen_lapack SHARED ${EigenLapack_SRCS}) +set(EIGEN_LAPACK_TARGETS "") -target_link_libraries(eigen_lapack eigen_blas) +add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS}) +list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack_static) -if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(eigen_lapack_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - target_link_libraries(eigen_lapack ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +if (EIGEN_BUILD_SHARED_LIBS) + add_library(eigen_lapack SHARED ${EigenLapack_SRCS}) + list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack) + target_link_libraries(eigen_lapack eigen_blas) endif() -add_dependencies(lapack eigen_lapack eigen_lapack_static) +foreach(target IN LISTS EIGEN_LAPACK_TARGETS) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() + add_dependencies(lapack ${target}) + install(TARGETS ${target} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endforeach() -install(TARGETS eigen_lapack eigen_lapack_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - - get_filename_component(eigen_full_path_to_testing_lapack "./testing/" ABSOLUTE) if(EXISTS ${eigen_full_path_to_testing_lapack}) @@ -141,6 +153,7 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) string(REPLACE "." 
"_" input_name ${input}) set(testName "${target}_${input_name}") if(EXISTS "${TEST_INPUT}") + add_dependencies(buildtests ${target}) add_test(NAME LAPACK-${testName} COMMAND "${CMAKE_COMMAND}" -DTEST=$ @@ -446,3 +459,6 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) endif() +elseif(EIGEN_BUILD_LAPACK AND NOT EIGEN_BUILD_BLAS) + message(FATAL_ERROR "EIGEN_BUILD_LAPACK requires EIGEN_BUILD_BLAS") +endif() #EIGEN_BUILD_LAPACK diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h index 0f8e70d360eba9ec6b1031b9fd689af909ca08e9..b621887275df49477656d23da0e31432bf65010f 100644 --- a/test/AnnoyingScalar.h +++ b/test/AnnoyingScalar.h @@ -126,7 +126,7 @@ template<> struct NumTraits : NumTraits { enum { - RequireInitialization = true + RequireInitialization = 1 }; typedef AnnoyingScalar Real; typedef AnnoyingScalar Nested; @@ -145,10 +145,6 @@ bool (isfinite)(const AnnoyingScalar& x) { } namespace internal { - template<> EIGEN_STRONG_INLINE AnnoyingScalar pcmp_eq(const AnnoyingScalar& a, const AnnoyingScalar& b) - { return AnnoyingScalar(pcmp_eq(*a.v, *b.v)); } - template<> EIGEN_STRONG_INLINE AnnoyingScalar pselect(const AnnoyingScalar& mask, const AnnoyingScalar& a, const AnnoyingScalar& b) - { return numext::equal_strict(*mask.v, 0.f) ? b : a; } template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); } template<> EIGEN_STRONG_INLINE float cast(const AnnoyingScalar& x) { return *x.v; } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 56664e783ecde7cb24f408f36ade2d9be0b1820c..dbd4bc618c23afa77a958c7e2562cce436319bc1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -42,45 +42,53 @@ endif() set(SPARSE_LIBS " ") find_package(CHOLMOD) -if(CHOLMOD_FOUND) +if(CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) add_definitions("-DEIGEN_CHOLMOD_SUPPORT") include_directories(${CHOLMOD_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ") + + ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ") endif() find_package(UMFPACK) -if(UMFPACK_FOUND) +if(UMFPACK_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_UMFPACK_SUPPORT") include_directories(${UMFPACK_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ") + + ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ") endif() find_package(KLU) -if(KLU_FOUND) +if(KLU_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_KLU_SUPPORT") include_directories(${KLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") + + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") endif() find_package(SuperLU 4.0) -if(SuperLU_FOUND) +if(SuperLU_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_SUPERLU_SUPPORT") include_directories(${SUPERLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") + + 
ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") endif() @@ -124,7 +132,7 @@ else() endif() find_package(SPQR) -if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) add_definitions("-DEIGEN_SPQR_SUPPORT") include_directories(${SPQR_INCLUDES}) set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) @@ -152,6 +160,7 @@ endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official") add_custom_target(BuildOfficial) +ei_add_test(clz) ei_add_test(rand) ei_add_test(meta) ei_add_test(numext) @@ -164,7 +173,6 @@ ei_add_test(nullary) ei_add_test(mixingtypes) ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") -ei_add_test(unalignedassert) ei_add_test(vectorization_logic) ei_add_test(basicstuff) ei_add_test(constructor) @@ -311,22 +319,6 @@ if(QT4_FOUND) ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}") endif() -if(UMFPACK_FOUND) - ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") -endif() - -if(KLU_FOUND OR SuiteSparse_FOUND) - ei_add_test(klu_support "" "${KLU_ALL_LIBS}") -endif() - -if(SUPERLU_FOUND) - ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") -endif() - -if(CHOLMOD_FOUND) - ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") -endif() - if(PARDISO_FOUND) ei_add_test(pardiso_support "" "${PARDISO_ALL_LIBS}") endif() @@ -335,7 +327,7 @@ if(PASTIX_FOUND AND (SCOTCH_FOUND OR METIS_FOUND)) ei_add_test(pastix_support "" "${PASTIX_ALL_LIBS}") endif() -if(SPQR_FOUND AND CHOLMOD_FOUND) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) ei_add_test(spqr_support "" "${SPQR_ALL_LIBS}") endif() @@ -384,40 +376,38 @@ if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang") message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.") endif() -if(EIGEN_TEST_CUDA) +find_package(CUDA 9.0) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -find_package(CUDA 5.0) -if(CUDA_FOUND) - - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") - endif() - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") else() - foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}") + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + foreach(ARCH IN 
LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - + ei_add_test(gpu_basic) - - unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) -endif() + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() @@ -429,8 +419,8 @@ if (EIGEN_TEST_HIP) set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") if (EXISTS ${HIP_PATH}) - - list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) find_package(HIP REQUIRED) if (HIP_FOUND) @@ -444,12 +434,12 @@ if (EIGEN_TEST_HIP) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(gpu_basic) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) - + elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") else () message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") - endif() + endif() endif() else () message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") diff --git a/test/SafeScalar.h b/test/SafeScalar.h new file mode 100644 index 0000000000000000000000000000000000000000..c5cb75717c7c0131814ee98ceb840ac83b48dd26 --- /dev/null +++ b/test/SafeScalar.h @@ -0,0 +1,30 @@ + +// A Scalar that asserts for uninitialized access. +template +class SafeScalar { + public: + SafeScalar() : initialized_(false) {} + SafeScalar(const SafeScalar& other) { + *this = other; + } + SafeScalar& operator=(const SafeScalar& other) { + val_ = T(other); + initialized_ = true; + return *this; + } + + SafeScalar(T val) : val_(val), initialized_(true) {} + SafeScalar& operator=(T val) { + val_ = val; + initialized_ = true; + } + + operator T() const { + VERIFY(initialized_ && "Uninitialized access."); + return val_; + } + + private: + T val_; + bool initialized_; +}; diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 1bc8e19f91a915c7a274015a4814a40455546a75..2388830902ddb5df4a6952b07e1a8f84d4918ffd 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -72,9 +72,9 @@ void pow_test() { for (int j = 0; j < num_cases; ++j) { Scalar e = static_cast(std::pow(x(i,j), y(i,j))); Scalar a = actual(i, j); - bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e)); - all_pass &= !fail; - if (fail) { + bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e)); + all_pass &= success; + if (!success) { std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl; } } @@ -626,6 +626,41 @@ template void min_max(const ArrayType& m) } } +template +struct shift_left { + template + Scalar operator()(const Scalar& v) const { + return v << N; + } +}; + +template +struct arithmetic_shift_right { + template + Scalar operator()(const Scalar& v) const { + return v >> N; + } +}; + +template void array_integer(const ArrayType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); + + ArrayType m1 = ArrayType::Random(rows, cols), + m2(rows, cols); + + m2 = m1.template shiftLeft<2>(); + VERIFY( (m2 == m1.unaryExpr(shift_left<2>())).all() ); 
+ m2 = m1.template shiftLeft<9>(); + VERIFY( (m2 == m1.unaryExpr(shift_left<9>())).all() ); + + m2 = m1.template shiftRight<2>(); + VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<2>())).all() ); + m2 = m1.template shiftRight<9>(); + VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() ); +} + EIGEN_DECLARE_TEST(array_cwise) { for(int i = 0; i < g_repeat; i++) { @@ -636,6 +671,8 @@ EIGEN_DECLARE_TEST(array_cwise) CALL_SUBTEST_5( array(ArrayXXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( array_integer(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( comparisons(Array()) ); diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index fb6be351e69af963db61c903ea0e9f6e3c6d52da..06e04a2fa04fbd8d94c3427d04f770438538ca55 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -211,6 +211,40 @@ template void cwise_min_max(const MatrixType& m) VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1).array(), (m1.array().max)( maxM1)); VERIFY_IS_APPROX(m1.array(), (m1.array().max)( minM1)); + // Test NaN propagation for min/max. + if (!NumTraits::IsInteger) { + m1(0,0) = NumTraits::quiet_NaN(); + // Elementwise. + VERIFY((numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + + + VERIFY((numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + + // Reductions. 
+ VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + if (m1.size() > 1) { + VERIFY(!(numext::isnan)(m1.template maxCoeff())); + VERIFY(!(numext::isnan)(m1.template minCoeff())); + } else { + VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + } + } } template void resize(const MatrixTraits& t) diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index e92a7dc97d185d7b2e67992b09435f01f1a08d31..41303775c74cfa9c0eaa8793a6e57d52d0e7040e 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -54,20 +54,46 @@ void bdcsvd_method() VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); } -// compare the Singular values returned with Jacobi and Bdc +// Compare the Singular values returned with Jacobi and Bdc. template -void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0) +void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0, int algoswap = 16, bool random = true) { - MatrixType m = MatrixType::Random(a.rows(), a.cols()); - BDCSVD bdc_svd(m); + MatrixType m = random ? MatrixType::Random(a.rows(), a.cols()) : a; + + BDCSVD bdc_svd(m.rows(), m.cols(), computationOptions); + bdc_svd.setSwitchSize(algoswap); + bdc_svd.compute(m); + JacobiSVD jacobi_svd(m); VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues()); + if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); } +// Verifies total deflation is **not** triggered. +void compare_bdc_jacobi_instance(bool structure_as_m, int algoswap = 16) +{ + MatrixXd m(4, 3); + if (structure_as_m) { + // The first 3 rows are the reduced form of Matrix 1 as shown below, and it + // has nonzero elements in the first column and diagonals only. + m << 1.056293, 0, 0, + -0.336468, 0.907359, 0, + -1.566245, 0, 0.149150, + -0.1, 0, 0; + } else { + // Matrix 1. + m << 0.882336, 18.3914, -26.7921, + -5.58135, 17.1931, -24.0892, + -20.794, 8.68496, -4.83103, + -8.4981, -10.5451, 23.9072; + } + compare_bdc_jacobi(m, 0, algoswap, false); +} + EIGEN_DECLARE_TEST(bdcsvd) { CALL_SUBTEST_3(( svd_verify_assert >(Matrix3f()) )); @@ -114,5 +140,13 @@ EIGEN_DECLARE_TEST(bdcsvd) // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); + + // Without total deflation issues. + CALL_SUBTEST_11(( compare_bdc_jacobi_instance(true) )); + CALL_SUBTEST_12(( compare_bdc_jacobi_instance(false) )); + + // With total deflation issues before, when it shouldn't be triggered. 
+ CALL_SUBTEST_13(( compare_bdc_jacobi_instance(true, 3) )); + CALL_SUBTEST_14(( compare_bdc_jacobi_instance(false, 3) )); } diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp index 1df22f73e5cad2ee749920c69ebe3004828373c7..c3de0b19a0ce6aaa8b65140dd2cebd88e33a1de1 100644 --- a/test/bfloat16_float.cpp +++ b/test/bfloat16_float.cpp @@ -32,18 +32,6 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa, return dest; } -void test_truncate(float input, float expected_truncation, float expected_rounding){ - bfloat16 truncated = Eigen::bfloat16_impl::truncate_to_bfloat16(input); - bfloat16 rounded = Eigen::bfloat16_impl::float_to_bfloat16_rtne(input); - if ((numext::isnan)(input)){ - VERIFY((numext::isnan)(static_cast(truncated)) || (numext::isinf)(static_cast(truncated))); - VERIFY((numext::isnan)(static_cast(rounded)) || (numext::isinf)(static_cast(rounded))); - return; - } - VERIFY_IS_EQUAL(expected_truncation, static_cast(truncated)); - VERIFY_IS_EQUAL(expected_rounding, static_cast(rounded)); -} - template void test_roundtrip() { // Representable T round trip via bfloat16 @@ -122,31 +110,6 @@ void test_conversion() VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000); VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000); - // Flush denormals to zero - for (float denorm = -std::numeric_limits::denorm_min(); - denorm < std::numeric_limits::denorm_min(); - denorm = nextafterf(denorm, 1.0f)) { - bfloat16 bf_trunc = Eigen::bfloat16_impl::truncate_to_bfloat16(denorm); - VERIFY_IS_EQUAL(static_cast(bf_trunc), 0.0f); - - // Implicit conversion of denormls to bool is correct - VERIFY_IS_EQUAL(static_cast(bfloat16(denorm)), false); - VERIFY_IS_EQUAL(bfloat16(denorm), false); - - if (std::signbit(denorm)) { - VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000); - } else { - VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000); - } - bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne(denorm); - VERIFY_IS_EQUAL(static_cast(bf_round), 0.0f); - if (std::signbit(denorm)) { - VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000); - } else { - VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000); - } - } - // Default is zero VERIFY_IS_EQUAL(static_cast(bfloat16()), 0.0f); @@ -156,52 +119,6 @@ void test_conversion() test_roundtrip >(); test_roundtrip >(); - // Truncate test - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0xf5c3), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(1, 0x80, 0x48, 0xf5c3), - BinaryToFloat(1, 0x80, 0x48, 0x0000), - BinaryToFloat(1, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x8000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0xff, 0x00, 0x0001), - BinaryToFloat(0, 0xff, 0x40, 0x0000), - BinaryToFloat(0, 0xff, 0x40, 0x0000)); - test_truncate( - BinaryToFloat(0, 0xff, 0x7f, 0xffff), - BinaryToFloat(0, 0xff, 0x40, 0x0000), - BinaryToFloat(0, 0xff, 0x40, 0x0000)); - test_truncate( - BinaryToFloat(1, 0x80, 0x48, 0xc000), - BinaryToFloat(1, 0x80, 0x48, 0x0000), - BinaryToFloat(1, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x4000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x8000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - 
test_truncate( - BinaryToFloat(0, 0x00, 0x48, 0x8000), - BinaryToFloat(0, 0x00, 0x00, 0x0000), - BinaryToFloat(0, 0x00, 0x00, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x00, 0x7f, 0xc000), - BinaryToFloat(0, 0x00, 0x00, 0x0000), - BinaryToFloat(0, 0x00, 0x00, 0x0000)); - // Conversion Array a; for (int i = 0; i < 100; i++) a(i) = i + 1.25; @@ -250,12 +167,6 @@ void test_conversion() VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0); VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0); - VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16( - BinaryToFloat(0x0, 0xff, 0x40, 0x0)), - 0x7fc0); - VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16( - BinaryToFloat(0x1, 0xff, 0x40, 0x0)), - 0xffc0); } void test_numtraits() diff --git a/test/block.cpp b/test/block.cpp index 84124aba642b38521d864a5b130ac3cab8b86684..667a3be3914308978bcb94fa49971e65fbbd326e 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -143,11 +143,12 @@ template void block(const MatrixType& m) // check that linear acccessors works on blocks m1 = m1_copy; - if((MatrixType::Flags&RowMajorBit)==0) - VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1)); - else - VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1+r1*cols), m1(r1,c1)); - + if (c1 > 0 && r1 > 0) { + if ((MatrixType::Flags & RowMajorBit) == 0) + VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1 + c1 * rows), m1(r1, c1)); + else + VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1 + r1 * cols), m1(r1, c1)); + } // now test some block-inside-of-block. diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp index 7c79ded23bc49639ded9cda751c10bdae0fe0ee0..e83e9704479c9a354677f60f0547d464e9b4ea4c 100644 --- a/test/boostmultiprec.cpp +++ b/test/boostmultiprec.cpp @@ -74,8 +74,7 @@ #include #include -namespace mp = boost::multiprecision; -typedef mp::number, mp::et_on> Real; +typedef boost::multiprecision::number, boost::multiprecision::et_on> Real; namespace Eigen { template<> struct NumTraits : GenericNumTraits { diff --git a/test/clz.cpp b/test/clz.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1d08b47155c8771731154e4687fdf40435bc7fff --- /dev/null +++ b/test/clz.cpp @@ -0,0 +1,74 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2023 The Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template +int ref_clz(T val) { + static const int kNumBits = sizeof(T) * CHAR_BIT; + T kMsbMask = T(1) << (kNumBits - 1); + int z = 0; + for (; z < kNumBits && ((val & kMsbMask) == 0); ++z) { + val <<= 1; + } + return z; +} + +template +int ref_ctz(T val) { + static const int kNumBits = sizeof(T) * CHAR_BIT; + T kLsbMask = T(1); + int z = 0; + for (; z < kNumBits && ((val & kLsbMask) == 0); ++z) { + val >>= 1; + } + return z; +} + +template +void test_clz_ctz() { + T step = sizeof(T) <= 2 ? 
1 : (Eigen::NumTraits::highest() / (T(1) << 16)); + T iters = Eigen::NumTraits::highest() / step; + for (T i = 0; i < iters; ++i) { + T val = i * step; + int expected_clz = ref_clz(val); + int actual_clz = Eigen::internal::clz(val); + VERIFY(expected_clz == actual_clz); + + int expected_ctz = ref_ctz(val); + int actual_ctz = Eigen::internal::ctz(val); + VERIFY(expected_ctz == actual_ctz); + } +} + +template +void test_clz_ctz_random() { + for (int i = 0; i < 1024 * 1024; ++i) { + T val = Eigen::internal::random(); + int expected_clz = ref_clz(val); + int actual_clz = Eigen::internal::clz(val); + VERIFY(expected_clz == actual_clz); + + int expected_ctz = ref_ctz(val); + int actual_ctz = Eigen::internal::ctz(val); + VERIFY(expected_ctz == actual_ctz); + } +} + +EIGEN_DECLARE_TEST(clz) { + CALL_SUBTEST_1(test_clz_ctz()); + CALL_SUBTEST_2(test_clz_ctz()); + CALL_SUBTEST_3(test_clz_ctz()); + CALL_SUBTEST_4(test_clz_ctz()); + + for (int i = 0; i < g_repeat; i++) { + test_clz_ctz_random(); + test_clz_ctz_random(); + } +} diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp index 5dc500068c232cfe5401ed7f5a757308cbbb5b05..d48eb126fdb1fca779981e258f0c82430c1a01d1 100644 --- a/test/conservative_resize.cpp +++ b/test/conservative_resize.cpp @@ -115,9 +115,11 @@ template void noncopyable() { typedef Eigen::Matrix VectorType; typedef Eigen::Matrix MatrixType; - + { +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif int n = 50; VectorType v0(n), v1(n); MatrixType m0(n,n), m1(n,n), m2(n,n); @@ -148,6 +150,7 @@ EIGEN_DECLARE_TEST(conservative_resize) CALL_SUBTEST_4((run_matrix_tests, Eigen::ColMajor>())); CALL_SUBTEST_5((run_matrix_tests, Eigen::RowMajor>())); CALL_SUBTEST_5((run_matrix_tests, Eigen::ColMajor>())); + CALL_SUBTEST_1((run_matrix_tests())); CALL_SUBTEST_1((run_vector_tests())); CALL_SUBTEST_2((run_vector_tests())); @@ -155,7 +158,9 @@ EIGEN_DECLARE_TEST(conservative_resize) CALL_SUBTEST_4((run_vector_tests >())); CALL_SUBTEST_5((run_vector_tests >())); +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif CALL_SUBTEST_6(( run_vector_tests() )); CALL_SUBTEST_6(( noncopyable<0>() )); } diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index 7fa25859dded034adcabba0ed0e151cfc6fbb2cb..45c2bd728a993ee4747901536aea2876cbf18126 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp @@ -8,17 +8,27 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "main.h" +#include "AnnoyingScalar.h" +#include "SafeScalar.h" #include -template -void dense_storage_copy() +#if EIGEN_HAS_TYPE_TRAITS && EIGEN_HAS_CXX11 +using DenseStorageD3x3 = Eigen::DenseStorage; +static_assert(std::is_trivially_move_constructible::value, "DenseStorage not trivially_move_constructible"); +static_assert(std::is_trivially_move_assignable::value, "DenseStorage not trivially_move_assignable"); +#if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) +static_assert(std::is_trivially_copy_constructible::value, "DenseStorage not trivially_copy_constructible"); +static_assert(std::is_trivially_copy_assignable::value, "DenseStorage not trivially_copy_assignable"); +static_assert(std::is_trivially_copyable::value, "DenseStorage not trivially_copyable"); +#endif +#endif + +template +void dense_storage_copy(int rows, int cols) { - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? 
Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; + typedef DenseStorage DenseStorageType; - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; const int size = rows*cols; DenseStorageType reference(size, rows, cols); T* raw_reference = reference.data(); @@ -31,14 +41,11 @@ void dense_storage_copy() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } -template -void dense_storage_assignment() +template +void dense_storage_assignment(int rows, int cols) { - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; + typedef DenseStorage DenseStorageType; - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; const int size = rows*cols; DenseStorageType reference(size, rows, cols); T* raw_reference = reference.data(); @@ -52,6 +59,34 @@ void dense_storage_assignment() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } +template +void dense_storage_swap(int rows0, int cols0, int rows1, int cols1) +{ + typedef DenseStorage DenseStorageType; + + const int size0 = rows0*cols0; + DenseStorageType a(size0, rows0, cols0); + for (int i=0; i(i); + } + + const int size1 = rows1*cols1; + DenseStorageType b(size1, rows1, cols1); + for (int i=0; i(-i); + } + + a.swap(b); + + for (int i=0; i(i)); + } + + for (int i=0; i(-i)); + } +} + template void dense_storage_alignment() { @@ -78,30 +113,78 @@ void dense_storage_alignment() #endif } -EIGEN_DECLARE_TEST(dense_storage) -{ - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); +template +void dense_storage_tests() { + // Dynamic Storage. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + // Fixed Storage. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); + // Dynamic Storage. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + // Fixed Storage. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + + // Dynamic Storage. + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + // Fixed Storage. 
+ dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); +} - dense_storage_alignment(); - dense_storage_alignment(); - dense_storage_alignment(); - dense_storage_alignment(); +EIGEN_DECLARE_TEST(dense_storage) +{ + dense_storage_tests(); + dense_storage_tests(); + dense_storage_tests >(); + dense_storage_tests(); } diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp index 95ed431db70a040e63f1183a9ccfce8cb41ed76c..a0c99b18a37a2c66bad959c99adede749d88f6dc 100644 --- a/test/eigensolver_generalized_real.cpp +++ b/test/eigensolver_generalized_real.cpp @@ -85,6 +85,42 @@ template void generalized_eigensolver_real(const MatrixType } } +template +void generalized_eigensolver_assert() { + GeneralizedEigenSolver eig; + // all raise assert if uninitialized + VERIFY_RAISES_ASSERT(eig.info()); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); + + // none raise assert after compute called + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == Success); + eig.eigenvectors(); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // eigenvectors() raises assert, if eigenvectors were not requested + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20), false); + VERIFY(eig.info() == Success); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // all except info raise assert if realQZ did not converge + eig.setMaxIterations(0); // force real QZ to fail. 
+ eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == NoConvergence); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); +} + EIGEN_DECLARE_TEST(eigensolver_generalized_real) { for(int i = 0; i < g_repeat; i++) { @@ -98,6 +134,7 @@ EIGEN_DECLARE_TEST(eigensolver_generalized_real) CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) ); CALL_SUBTEST_3( generalized_eigensolver_real(Matrix()) ); CALL_SUBTEST_4( generalized_eigensolver_real(Matrix2d()) ); + CALL_SUBTEST_5( generalized_eigensolver_assert() ); TEST_SET_BUT_UNUSED_VARIABLE(s) } } diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 65b80c3fb1c71494808bd938037a5b79e2494a57..0fb2f4da766209ab394459bdc969e97bba1d6bb5 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -234,15 +234,21 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) { int s = 0; for(int i = 0; i < g_repeat; i++) { + // trivial test for 1x1 matrices: CALL_SUBTEST_1( selfadjointeigensolver(Matrix())); CALL_SUBTEST_1( selfadjointeigensolver(Matrix())); + CALL_SUBTEST_1( selfadjointeigensolver(Matrix, 1, 1>())); + // very important to test 3x3 and 2x2 matrices since we provide special paths for them CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) ); CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) ); + CALL_SUBTEST_12( selfadjointeigensolver(Matrix2cd()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) ); + CALL_SUBTEST_13( selfadjointeigensolver(Matrix3cd()) ); CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) ); + CALL_SUBTEST_2( selfadjointeigensolver(Matrix4cd()) ); s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) ); @@ -254,6 +260,8 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) // some trivial but implementation-wise tricky cases CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) ); CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(2,2)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(1,1)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(2,2)) ); CALL_SUBTEST_6( selfadjointeigensolver(Matrix()) ); CALL_SUBTEST_7( selfadjointeigensolver(Matrix()) ); } diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index 2c89ecd2130ba84b03695e52e0dba9223e4ebea5..44b2f2aecb3e8e9b239f93cc586778f0b11b779d 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -172,11 +172,6 @@ template void hyperplane_alignment() VERIFY_IS_APPROX(p1->coeffs(), p2->coeffs()); VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs()); - - #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES > 0 - if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) - VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Plane3a)); - #endif } diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp index b7b660740284d3bb94b14693a4a1134bfb8f6eac..5f7ddb91f730e02428867bd1a019377ac83ad19f 100644 --- a/test/geo_orthomethods.cpp +++ b/test/geo_orthomethods.cpp @@ -73,8 +73,9 @@ template void orthomethods_3() // check mixed product typedef Matrix RealVector3; RealVector3 rv1 = RealVector3::Random(); - VERIFY_IS_APPROX(v1.cross(rv1.template cast()), v1.cross(rv1)); - VERIFY_IS_APPROX(rv1.template cast().cross(v1), rv1.cross(v1)); + v2 = rv1.template cast(); + VERIFY_IS_APPROX(v1.cross(v2), v1.cross(rv1)); + 
diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp
index b7b660740284d3bb94b14693a4a1134bfb8f6eac..5f7ddb91f730e02428867bd1a019377ac83ad19f 100644
--- a/test/geo_orthomethods.cpp
+++ b/test/geo_orthomethods.cpp
@@ -73,8 +73,9 @@ template<typename Scalar> void orthomethods_3()
   // check mixed product
   typedef Matrix<RealScalar, 3, 1> RealVector3;
   RealVector3 rv1 = RealVector3::Random();
-  VERIFY_IS_APPROX(v1.cross(rv1.template cast<Scalar>()), v1.cross(rv1));
-  VERIFY_IS_APPROX(rv1.template cast<Scalar>().cross(v1), rv1.cross(v1));
+  v2 = rv1.template cast<Scalar>();
+  VERIFY_IS_APPROX(v1.cross(v2), v1.cross(rv1));
+  VERIFY_IS_APPROX(v2.cross(v1), rv1.cross(v1));
 }
 
 template<typename Scalar, int Size> void orthomethods(int size=Size)
diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
index 7135c8fa54d8162b280f8eee9577052abb43573b..e4b194abc407a7b45bbc437688462051b1132864 100644
--- a/test/geo_parametrizedline.cpp
+++ b/test/geo_parametrizedline.cpp
@@ -110,11 +110,6 @@ template<typename Scalar> void parametrizedline_alignment()
   VERIFY_IS_APPROX(p1->origin(), p3->origin());
   VERIFY_IS_APPROX(p1->direction(), p2->direction());
   VERIFY_IS_APPROX(p1->direction(), p3->direction());
-
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Line4a));
-  #endif
 }
 
 EIGEN_DECLARE_TEST(geo_parametrizedline)
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index c4a3162b357b5ef6034729083e7089147844ff5b..c561fc89d0e8f314354468824cce8a3bfd8d3511 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -218,10 +218,6 @@ template<typename Scalar> void mapQuaternion(void){
   VERIFY_IS_APPROX(q1.coeffs(), q2.coeffs());
   VERIFY_IS_APPROX(q1.coeffs(), q3.coeffs());
   VERIFY_IS_APPROX(q4.coeffs(), q3.coeffs());
-  #ifdef EIGEN_VECTORIZE
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((MQuaternionA(array3unaligned)));
-  #endif
 
   VERIFY_IS_APPROX(mq1 * (mq1.inverse() * v1), v1);
   VERIFY_IS_APPROX(mq1 * (mq1.conjugate() * v1), v1);
@@ -281,10 +277,6 @@ template<typename Scalar> void quaternionAlignment(void){
   VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs());
   VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs());
-
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(arrayunaligned)) QuaternionA));
-  #endif
 }
 
 template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index d433561cbb1387e642c71e47d08241a98625a64d..72c6edac14708ffb3969855dd7096ca94504773f 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -582,11 +582,6 @@ template<typename Scalar> void transform_alignment()
   VERIFY_IS_APPROX(p1->matrix(), p3->matrix());
   VERIFY_IS_APPROX( (*p1) * (*p1), (*p2)*(*p3));
-
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Projective3a));
-  #endif
 }
 
 template<typename Scalar> void transform_products()
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index bf8dcacde280039e0c2eb4c329659ca7cd5bdff8..e424a93c9e86959635d64734588a97e83158f072 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -138,10 +138,12 @@ struct complex_operators {
     out[out_idx++] = a / numext::real(b);
     out[out_idx++] = numext::real(a) / b;
 
+#if !defined(EIGEN_COMP_MSVC)
     out[out_idx] = a; out[out_idx++] += b;
     out[out_idx] = a; out[out_idx++] -= b;
     out[out_idx] = a; out[out_idx++] *= b;
     out[out_idx] = a; out[out_idx++] /= b;
+#endif
 
     const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
     const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
@@ -188,6 +190,7 @@ struct complex_operators {
     res.segment(block_idx, size) = x1.real().array() / x2.array();
     block_idx += size;
 
+#if !defined(EIGEN_COMP_MSVC)
     res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
     block_idx += size;
     res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
@@ -196,19 +199,19 @@ struct
complex_operators { block_idx += size; res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array(); block_idx += size; +#endif - // Equality comparisons currently not functional on device - // (std::equal_to is host-only). - // const T true_vector = T::Constant(true_value); - // const T false_vector = T::Constant(false_value); - // res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); - // block_idx += size; + const T true_vector = T::Constant(true_value); + const T false_vector = T::Constant(false_value); + res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); + block_idx += size; + // Mixing types in equality comparison does not work. // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector); // block_idx += size; // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector); // block_idx += size; - // res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector); - // block_idx += size; + res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector); + block_idx += size; // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector); // block_idx += size; // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector); diff --git a/test/half_float.cpp b/test/half_float.cpp index 729de1bc725d2b0d4db43c7f9e48f90de92b50be..ffb3215b9d85fcd574122654a907ddfd2d08d923 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -224,6 +224,8 @@ void test_comparison() void test_basic_functions() { + const float PI = static_cast(EIGEN_PI); + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); @@ -251,8 +253,8 @@ void test_basic_functions() VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); - VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); + VERIFY_IS_APPROX(float(numext::exp(half(PI))), 20.f + PI); + VERIFY_IS_APPROX(float(exp(half(PI))), 20.f + PI); VERIFY_IS_EQUAL(float(numext::expm1(half(0.0f))), 0.0f); VERIFY_IS_EQUAL(float(expm1(half(0.0f))), 0.0f); @@ -277,25 +279,26 @@ void test_basic_functions() void test_trigonometric_functions() { + const float PI = static_cast(EIGEN_PI); VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - // VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::cos(half(PI)), half(cosf(PI))); + // VERIFY_IS_APPROX(numext::cos(half(PI/2)), half(cosf(PI/2))); + // VERIFY_IS_APPROX(numext::cos(half(3*PI/2)), half(cosf(3*PI/2))); VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); + // VERIFY_IS_APPROX(numext::sin(half(PI)), half(sinf(PI))); + VERIFY_IS_APPROX(numext::sin(half(PI/2)), half(sinf(PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3*PI/2)), 
half(sinf(3*PI/2)));
   VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
 
   VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
   VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f)));
-  // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
-  // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
-  //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::tan(half(PI)), half(tanf(PI)));
+  // VERIFY_IS_APPROX(numext::tan(half(PI/2)), half(tanf(PI/2)));
+  //VERIFY_IS_APPROX(numext::tan(half(3*PI/2)), half(tanf(3*PI/2)));
   VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
 }
diff --git a/test/inverse.cpp b/test/inverse.cpp
index 99f9e0c9b238fd280c89ea4df93a397f89958499..9cedfa1e1069ce7a15e4056a6ab95fe4cccc7405 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -135,6 +135,8 @@ EIGEN_DECLARE_TEST(inverse)
     CALL_SUBTEST_5( inverse(MatrixXf(s,s)) );
     TEST_SET_BUT_UNUSED_VARIABLE(s)
     CALL_SUBTEST_5( inverse_zerosized<MatrixXf>() );
+    CALL_SUBTEST_5( inverse(MatrixXf(0, 0)) );
+    CALL_SUBTEST_5( inverse(MatrixXf(1, 1)) );
 
     s = internal::random<int>(25,100);
     CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) );
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 89484d97106a17c80e537db61d7060c257f2c8ce..5b15c5a27bff8996be7ab66dc49581fcfd226840 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -36,6 +36,9 @@ void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
 template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
 {
   svd_verify_assert<JacobiSVD<MatrixType> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> >(m, true);
+  svd_verify_assert<JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, HouseholderQRPreconditioner> >(m);
   Index rows = m.rows();
   Index cols = m.cols();
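The new `svd_verify_assert` calls above exercise JacobiSVD's QR preconditioners; the `true` flag marks the full-pivoting variant, which only supports full unitaries. A minimal usage sketch (matrix size is illustrative):

```cpp
#include <Eigen/SVD>

int main() {
  Eigen::MatrixXf m = Eigen::MatrixXf::Random(5, 3);
  // FullPivHouseholderQRPreconditioner supports only ComputeFullU/V,
  // which is why the test passes fullOnly = true for it.
  Eigen::JacobiSVD<Eigen::MatrixXf, Eigen::FullPivHouseholderQRPreconditioner>
      svd(m, Eigen::ComputeFullU | Eigen::ComputeFullV);
  return svd.singularValues().size() == 3 ? 0 : 1;
}
```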
diff --git a/test/main.h b/test/main.h
index 07f3794ac8be407cc859bc44ae2afa14902d8d21..19bbf1b8191d7b0e07c23ffe506e8ef6c351f6ee 100644
--- a/test/main.h
+++ b/test/main.h
@@ -111,6 +111,9 @@ struct imag {};
 // `I` may be defined by complex.h:
 #define I  FORBIDDEN_IDENTIFIER
 
+// _res is defined by resolv.h
+#define _res  FORBIDDEN_IDENTIFIER
+
 // Unit tests calling Eigen's blas library must preserve the default blocking size
 // to avoid troubles.
 #ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
@@ -391,6 +394,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
 #define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_isMuchSmallerThan(a, b))
 #define VERIFY_IS_APPROX_OR_LESS_THAN(a, b) VERIFY(test_isApproxOrLessThan(a, b))
 #define VERIFY_IS_NOT_APPROX_OR_LESS_THAN(a, b) VERIFY(!test_isApproxOrLessThan(a, b))
+#define VERIFY_IS_CWISE_EQUAL(a, b)  VERIFY(verifyIsCwiseApprox(a, b, true))
+#define VERIFY_IS_CWISE_APPROX(a, b) VERIFY(verifyIsCwiseApprox(a, b, false))
 
 #define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a))
 
@@ -422,7 +427,13 @@ template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
 
 #define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE)                             \
   inline bool test_isApprox(TYPE a, TYPE b)                               \
-  { return internal::isApprox(a, b, test_precision<TYPE>()); }            \
+  { return numext::equal_strict(a, b) ||                                  \
+           ((numext::isnan)(a) && (numext::isnan)(b)) ||                  \
+           (internal::isApprox(a, b, test_precision<TYPE>())); }          \
+  inline bool test_isCwiseApprox(TYPE a, TYPE b, bool exact)              \
+  { return numext::equal_strict(a, b) ||                                  \
+           ((numext::isnan)(a) && (numext::isnan)(b)) ||                  \
+           (!exact && internal::isApprox(a, b, test_precision<TYPE>())); } \
   inline bool test_isMuchSmallerThan(TYPE a, TYPE b)                      \
   { return internal::isMuchSmallerThan(a, b, test_precision<TYPE>()); }   \
   inline bool test_isApproxOrLessThan(TYPE a, TYPE b)                     \
@@ -592,6 +603,22 @@ inline bool verifyIsApprox(const Type1& a, const Type2& b)
   return ret;
 }
 
+// verifyIsCwiseApprox is a wrapper to test_isCwiseApprox that outputs the
+// relative difference magnitude if the test fails.
+template<typename Type1, typename Type2>
+inline bool verifyIsCwiseApprox(const Type1& a, const Type2& b, bool exact)
+{
+  bool ret = test_isCwiseApprox(a, b, exact);
+  if(!ret) {
+    if (exact) {
+      std::cerr << "Values are not an exact match";
+    } else {
+      std::cerr << "Difference too large wrt tolerance " << get_test_precision(a);
+    }
+    std::cerr << ", relative error is: " << test_relative_error(a,b) << std::endl;
+  }
+  return ret;
+}
+
 // The idea behind this function is to compare the two scalars a and b where
 // the scalar ref is a hint about the expected order of magnitude of a and b.
 // WARNING: the scalar a and b must be positive
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index d450dbff8bbfd324e5b7b19b0329dbebef52ba45..2af7b888747598d546615e2d92de8ff3a8deb937 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -139,11 +139,12 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_MIX_SCALAR(scd - vd.array()   , scd - vd.template cast<complex<double> >().array());
 
   // check scalar powers
-  VERIFY_MIX_SCALAR( pow(vcf.array(), sf),        Eigen::pow(vcf.array(), complex<float>(sf)) );
-  VERIFY_MIX_SCALAR( vcf.array().pow(sf) ,        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  // NOTE: scalar exponents use a unary op.
+ VERIFY_IS_APPROX( pow(vcf.array(), sf), Eigen::pow(vcf.array(), complex(sf)) ); + VERIFY_IS_APPROX( vcf.array().pow(sf) , Eigen::pow(vcf.array(), complex(sf)) ); VERIFY_MIX_SCALAR( pow(sd, vcd.array()), Eigen::pow(complex(sd), vcd.array()) ); - VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); - VERIFY_MIX_SCALAR( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast >().array()) ); // check dot product diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp index c5356bc24cb8e571309b5cb674aecc41602fedd2..3a86bea5057e7da82d7a9e79ed780b8ebbfd44a0 100644 --- a/test/nestbyvalue.cpp +++ b/test/nestbyvalue.cpp @@ -26,7 +26,7 @@ EIGEN_DECLARE_TEST(nestbyvalue) for(int i = 0; i < g_repeat; i++) { Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); - MatrixXd a = MatrixXd(rows,cols); + MatrixXd a = MatrixXd::Random(rows,cols); nb_temporaries = 0; XprType x = get_xpr_with_temps(a); VERIFY_IS_EQUAL(nb_temporaries,6); diff --git a/test/numext.cpp b/test/numext.cpp index cf1ca173dd66f3abc49974df24f09843fedcd2db..8a2fde5015bc50baaf41d0c3b3eb8c2662626f20 100644 --- a/test/numext.cpp +++ b/test/numext.cpp @@ -61,6 +61,20 @@ void check_abs() { } } +template +void check_arg() { + typedef typename NumTraits::Real Real; + VERIFY_IS_EQUAL(numext::abs(T(0)), T(0)); + VERIFY_IS_EQUAL(numext::abs(T(1)), T(1)); + + for(int k=0; k<100; ++k) + { + T x = internal::random(); + Real y = numext::arg(x); + VERIFY_IS_APPROX( y, std::arg(x) ); + } +} + template struct check_sqrt_impl { static void run() { @@ -242,10 +256,12 @@ EIGEN_DECLARE_TEST(numext) { CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs >() ); CALL_SUBTEST( check_abs >() ); + CALL_SUBTEST( check_arg >() ); + CALL_SUBTEST( check_arg >() ); + CALL_SUBTEST( check_sqrt() ); CALL_SUBTEST( check_sqrt() ); CALL_SUBTEST( check_sqrt >() ); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 67d329a67ff03bdb5aa633a2e8f46c5b88c89ee0..518b801b9f274448cc3e51dfaed5d1ca7fb11575 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -52,6 +52,12 @@ inline T REF_FREXP(const T& x, T& exp) { EIGEN_USING_STD(frexp) const T out = static_cast(frexp(x, &iexp)); exp = static_cast(iexp); + + // The exponent value is unspecified if the input is inf or NaN, but MSVC + // seems to set it to 1. We need to set it back to zero for consistency. + if (!(numext::isfinite)(x)) { + exp = T(0); + } return out; } @@ -279,19 +285,84 @@ void packetmath_boolean_mask_ops() { CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq); } +template +void packetmath_boolean_mask_ops_real() { + const int PacketSize = internal::unpacket_traits::size; + const int size = 2 * PacketSize; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data1[i + PacketSize] = internal::random() ? 
data1[i] : Scalar(0); + } + + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); + + //Test (-0) <=/< (0) for signed operations + for (int i = 0; i < PacketSize; ++i) { + data1[i] = Scalar(-0.0); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); + + //Test NaN + for (int i = 0; i < PacketSize; ++i) { + data1[i] = NumTraits::quiet_NaN(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); +} + +template +void packetmath_boolean_mask_ops_notcomplex() { + const int PacketSize = internal::unpacket_traits::size; + const int size = 2 * PacketSize; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); + + //Test (-0) <=/< (0) for signed operations + for (int i = 0; i < PacketSize; ++i) { + data1[i] = Scalar(-0.0); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); + + //Test NaN + for (int i = 0; i < PacketSize; ++i) { + data1[i] = NumTraits::quiet_NaN(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); +} + // Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path // (for some compilers) compute the bitwise and with 0x1 of the results to keep the value in [0,1]. template<> void packetmath_boolean_mask_ops::type>() {} +template<> +void packetmath_boolean_mask_ops_notcomplex::type>() {} template void packetmath_minus_zero_add() { const int PacketSize = internal::unpacket_traits::size; const int size = 2 * PacketSize; - EIGEN_ALIGN_MAX Scalar data1[size]; - EIGEN_ALIGN_MAX Scalar data2[size]; - EIGEN_ALIGN_MAX Scalar ref[size]; - + EIGEN_ALIGN_MAX Scalar data1[size] = {}; + EIGEN_ALIGN_MAX Scalar data2[size] = {}; + EIGEN_ALIGN_MAX Scalar ref[size] = {}; + for (int i = 0; i < PacketSize; ++i) { data1[i] = Scalar(-0.0); data1[i + PacketSize] = Scalar(-0.0); @@ -453,9 +524,7 @@ void packetmath() { for (int i = 0; i < PacketSize; ++i) ref[0] += data1[i]; VERIFY(test::isApproxAbs(ref[0], internal::predux(internal::pload(data1)), refvalue) && "internal::predux"); - if (PacketSize == 8 && internal::unpacket_traits::half>::size == - 4) // so far, predux_half_downto4 is only required in such a case - { + if (!internal::is_same::half>::value) { int HalfPacketSize = PacketSize > 4 ? 
PacketSize / 2 : PacketSize; for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0); for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i]; @@ -483,6 +552,27 @@ void packetmath() { } } + // GeneralBlockPanelKernel also checks PacketBlock; + if (PacketSize > 4 && PacketSize % 4 == 0) { + internal::PacketBlock kernel2; + for (int i = 0; i < 4; ++i) { + kernel2.packet[i] = internal::pload(data1 + i * PacketSize); + } + ptranspose(kernel2); + int data_counter = 0; + for (int i = 0; i < PacketSize; ++i) { + for (int j = 0; j < 4; ++j) { + data2[data_counter++] = data1[j*PacketSize + i]; + } + } + for (int i = 0; i < 4; ++i) { + internal::pstore(data3, kernel2.packet[i]); + for (int j = 0; j < PacketSize; ++j) { + VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose"); + } + } + } + if (PacketTraits::HasBlend) { Packet thenPacket = internal::pload(data1); Packet elsePacket = internal::pload(data2); @@ -547,9 +637,19 @@ void packetmath_real() { const int PacketSize = internal::unpacket_traits::size; const int size = PacketSize * 4; - EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4]; - EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4]; - EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4]; + EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4] = {}; + EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4] = {}; + EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4] = {}; + + // Negate with -0. + if (PacketTraits::HasNegate) { + test::packet_helper h; + data1[0] = Scalar(-0); + h.store(data2, internal::pnegate(h.load(data1))); + typedef typename internal::make_unsigned::type>::type Bits; + Bits bits = numext::bit_cast(data2[0]); + VERIFY_IS_EQUAL(bits, static_cast(Bits(1)<<(sizeof(Scalar)*CHAR_BIT - 1))); + } for (int i = 0; i < size; ++i) { data1[i] = Scalar(internal::random(0, 1) * std::pow(10., internal::random(-6, 6))); @@ -574,6 +674,8 @@ void packetmath_real() { CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print); + + packetmath_boolean_mask_ops_real(); // Rounding edge cases. 
if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) { @@ -1020,6 +1122,8 @@ void packetmath_notcomplex() { CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin)); CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax); } + + packetmath_boolean_mask_ops_notcomplex(); } template diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp index 07246646793ff97b047160428f1f5024ce0305ef..86f057118cf1c3ba1e906eb7fd6160c4cbd4636f 100644 --- a/test/prec_inverse_4x4.cpp +++ b/test/prec_inverse_4x4.cpp @@ -30,18 +30,17 @@ template void inverse_general_4x4(int repeat) { using std::abs; typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; double error_sum = 0., error_max = 0.; for(int i = 0; i < repeat; ++i) { MatrixType m; - RealScalar absdet; + bool is_invertible; do { m = MatrixType::Random(); - absdet = abs(m.determinant()); - } while(absdet < NumTraits::epsilon()); + is_invertible = Eigen::FullPivLU(m).isInvertible(); + } while(!is_invertible); MatrixType inv = m.inverse(); - double error = double( (m*inv-MatrixType::Identity()).norm() * absdet / NumTraits::epsilon() ); + double error = double( (m*inv-MatrixType::Identity()).norm()); error_sum += error; error_max = (std::max)(error_max, error); } diff --git a/test/product_small.cpp b/test/product_small.cpp index 1d6df6e58765451d0bbf58508111816a85b75620..fec7f5658f9b6c920740e5ac8aec7a91a3beddda 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -70,7 +70,7 @@ void test_dynamic_bool() for(Index i=0;i +// Copyright (C) 2013 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed diff --git a/test/reshape.cpp b/test/reshape.cpp index 7b16742a23faeaafadad3fc95dec7a9918c0db7b..1f966ebaee6e4e4bed8223df6953005d7e2ca190 100644 --- a/test/reshape.cpp +++ b/test/reshape.cpp @@ -193,6 +193,24 @@ void reshape4x4(MatType m) } } +template +void reshape_block(const BlockType& M) { + typename BlockType::PlainObject dense = M.eval(); + Index rows = M.size() / 2; + Index cols = M.size() / rows; + VERIFY_IS_EQUAL(dense.reshaped(rows, cols), M.reshaped(rows, cols)); + + for (Index i=0; i RowMatrixXi; @@ -213,4 +231,5 @@ EIGEN_DECLARE_TEST(reshape) CALL_SUBTEST(reshape4x4(rmx)); CALL_SUBTEST(reshape4x4(rm4)); + CALL_SUBTEST(reshape_block(rm4.col(1))); } diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp index c20a32f790afbe62ff7240988731912a94896f46..2c9999ce80851e46579b405669bf8d912a77c6bf 100644 --- a/test/rvalue_types.cpp +++ b/test/rvalue_types.cpp @@ -13,41 +13,12 @@ #if EIGEN_HAS_CXX11 #include "MovableScalar.h" #endif +#include "SafeScalar.h" #include using internal::UIntPtr; -// A Scalar that asserts for uninitialized access. 
-// A Scalar that asserts for uninitialized access.
-template<typename T>
-class SafeScalar {
- public:
-  SafeScalar() : initialized_(false) {}
-  SafeScalar(const SafeScalar& other) {
-    *this = other;
-  }
-  SafeScalar& operator=(const SafeScalar& other) {
-    val_ = T(other);
-    initialized_ = true;
-    return *this;
-  }
-
-  SafeScalar(T val) : val_(val), initialized_(true) {}
-  SafeScalar& operator=(T val) {
-    val_ = val;
-    initialized_ = true;
-  }
-
-  operator T() const {
-    VERIFY(initialized_ && "Uninitialized access.");
-    return val_;
-  }
-
- private:
-  T val_;
-  bool initialized_;
-};
-
 #if EIGEN_HAS_RVALUE_REFERENCES
 
 template <typename MatrixType>
 void rvalue_copyassign(const MatrixType& m)
diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp
index 03e17e81defd09f67f87866a34ba684aa629f5d6..26acb8c3ac3f67618bfe6363a76fe205e77eb9d3 100644
--- a/test/schur_complex.cpp
+++ b/test/schur_complex.cpp
@@ -54,7 +54,8 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTime)
   VERIFY_IS_EQUAL(cs3.matrixT(), cs1.matrixT());
   VERIFY_IS_EQUAL(cs3.matrixU(), cs1.matrixU());
   cs3.setMaxIterations(1).compute(A);
-  VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success);
+  // The Schur decomposition often converges within a single iteration, so this check is disabled.
+  // VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success);
   VERIFY_IS_EQUAL(cs3.getMaxIterations(), 1);
 
   MatrixType Atriangular = A;
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index f9668102c431dcda56f2ae2cda1ebc829814899f..b4905b0531743a5d3eefa066d6994382b5eae4ec 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -315,8 +315,9 @@ EIGEN_DECLARE_TEST(sparse_block)
     CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short>(short(r), short(c))) ));
     CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short>(short(r), short(c))) ));
-
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
     AnnoyingScalar::dont_throw = true;
+#endif
     CALL_SUBTEST_5(( sparse_block(SparseMatrix<AnnoyingScalar>(r,c)) ));
   }
 }
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index 12b6f8a9dd7b66b6b9f9badb105e54e45bb50178..8f33af858ca56e8e0d32625acc5c03ac7581f75f 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 20015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
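The next hunk makes the sparse solvers write into a non-contiguous destination through a strided `Map`. For reference, a small sketch of mapping every other entry of a larger buffer (names and sizes are illustrative, not from the patch):

```cpp
#include <Eigen/Dense>

int main() {
  const int n = 3, m = 2;
  Eigen::MatrixXd out = Eigen::MatrixXd::Zero(2 * n, 2 * m);
  // View an n-by-m matrix whose entries occupy every other slot:
  // outer stride 2*n (columns are 2*n apart), inner stride 2.
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::Stride<Eigen::Dynamic, 2> >
      view(out.data(), n, m, Eigen::Stride<Eigen::Dynamic, 2>(2 * n, 2));
  view.setConstant(1.0);  // writes through the strided view into 'out'
  return 0;
}
```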
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index 58927944bdd98ac86133025c79a9437daefe9018..6f95e2fa7a9df00d41aafd890c6073df9100536f 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -99,6 +99,13 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
       VERIFY(solver.info() == Success && "solving failed when using Map");
       VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");
       VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
+
+      // Test with a Map and non-unit stride.
+      Eigen::Matrix<Scalar, Dynamic, Dynamic> out(2*xm.rows(), 2*xm.cols());
+      out.setZero();
+      Eigen::Map<DenseRhs, 0, Stride<Dynamic, 2> > outm(out.data(), xm.rows(), xm.cols(), Stride<Dynamic, 2>(2 * xm.rows(), 2));
+      outm = solver.solve(bm);
+      VERIFY(outm.isApprox(refX,test_precision<Scalar>()));
     }
 
     // if not too large, do some extra check:
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
index a78516e24eba1a093a0141cfbf3a3b6ed8b383ca..843e28c3cbf87a3e521d07f28c6c5d5fe58d8756 100644
--- a/test/stdlist_overload.cpp
+++ b/test/stdlist_overload.cpp
@@ -63,7 +63,7 @@ void check_stdlist_matrix(const MatrixType& m)
     ++itw;
   }
 
-  v.resize(21);
+  v.resize(21, MatrixType::Zero(rows, cols));
   set(v, 20, x);
   VERIFY_IS_APPROX(*get(v, 20), x);
   v.resize(22,y);
diff --git a/test/stdvector.cpp b/test/stdvector.cpp
index 18de240c6e68f490b590808cef56ad0515fc6ff2..9c023d656483fc6914c1ab15b0c549adefb3d34d 100644
--- a/test/stdvector.cpp
+++ b/test/stdvector.cpp
@@ -52,7 +52,7 @@ void check_stdvector_transform(const TransformType&)
 {
   typedef typename TransformType::MatrixType MatrixType;
   TransformType x(MatrixType::Random()), y(MatrixType::Random());
-  std::vector<TransformType, aligned_allocator<TransformType> > v(10), w(20, y);
+  std::vector<TransformType, aligned_allocator<TransformType> > v(10, TransformType(MatrixType::Zero())), w(20, y);
   v[5] = x;
   w[6] = v[5];
   VERIFY_IS_APPROX(w[6], v[5]);
@@ -124,7 +124,7 @@ void std_vector_gcc_warning()
 {
   typedef Eigen::Vector3f T;
   std::vector<T, Eigen::aligned_allocator<T> > v;
-  v.push_back(T());
+  v.push_back(T::Zero());
 }
 
 EIGEN_DECLARE_TEST(stdvector)
diff --git a/test/svd_common.h b/test/svd_common.h
index bd62edcc8442c8375e6551c4b4e449f3d83581bb..eae4c0bfe31e98b7e3cbe157768300f813220cee 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -462,7 +462,7 @@ void svd_preallocate()
 }
 
 template<typename SvdType, typename MatrixType>
-void svd_verify_assert(const MatrixType& m)
+void svd_verify_assert(const MatrixType& m, bool fullOnly = false)
 {
   typedef typename MatrixType::Scalar Scalar;
   Index rows = m.rows();
@@ -489,8 +489,17 @@ void svd_verify_assert(const MatrixType& m)
   VERIFY_RAISES_ASSERT(svd.matrixV())
   svd.singularValues();
   VERIFY_RAISES_ASSERT(svd.solve(rhs))
-
-  if (ColsAtCompileTime == Dynamic)
+
+  svd.compute(a, ComputeFullU);
+  svd.matrixU();
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  svd.compute(a, ComputeFullV);
+  svd.matrixV();
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+
+  if (!fullOnly && ColsAtCompileTime == Dynamic)
   {
     svd.compute(a, ComputeThinU);
     svd.matrixU();
diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp
index b114cbb9565eb8ff4a403e918f37211123d7214a..a75ca1165ae5888c97714fecf3d3f8408672dc19 100644
--- a/test/symbolic_index.cpp
+++ b/test/symbolic_index.cpp
@@ -58,15 +58,15 @@ void check_symbolic_index()
   VERIFY( is_same_type( fix<9>()/2, int(9/2) ) );
 
   VERIFY( is_same_symb( lastp1-1, last, size) );
-  VERIFY( is_same_symb( lastp1-fix<1>, last, size) );
+  VERIFY( is_same_symb( lastp1-fix<1>(), last, size) );
 
   VERIFY_IS_EQUAL( ( (last*5-2)/3 ).eval(last=size-1), ((size-1)*5-2)/3 );
-  VERIFY_IS_EQUAL( ( (last*fix<5>-fix<2>)/fix<3> ).eval(last=size-1), ((size-1)*5-2)/3 );
+  VERIFY_IS_EQUAL( ( (last*fix<5>()-fix<2>())/fix<3>() ).eval(last=size-1), ((size-1)*5-2)/3 );
   VERIFY_IS_EQUAL( ( -last*lastp1  ).eval(last=size-1), -(size-1)*size );
   VERIFY_IS_EQUAL( ( lastp1-3*last  ).eval(last=size-1), size- 3*(size-1) );
   VERIFY_IS_EQUAL( ( (lastp1-3*last)/lastp1 ).eval(last=size-1), (size- 3*(size-1))/size );
 
-#if EIGEN_HAS_CXX14
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
   {
     struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
     struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
diff --git
a/test/unalignedassert.cpp b/test/unalignedassert.cpp deleted file mode 100644 index 120cc42bbf45945fc9761ef1074204be2cbea8ab..0000000000000000000000000000000000000000 --- a/test/unalignedassert.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008 Benoit Jacob -// Copyright (C) 2015 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_TEST_PART_1) - // default -#elif defined(EIGEN_TEST_PART_2) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 16 - #define EIGEN_MAX_ALIGN_BYTES 16 -#elif defined(EIGEN_TEST_PART_3) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 32 - #define EIGEN_MAX_ALIGN_BYTES 32 -#elif defined(EIGEN_TEST_PART_4) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 64 - #define EIGEN_MAX_ALIGN_BYTES 64 -#endif - -#include "main.h" - -typedef Matrix Vector6f; -typedef Matrix Vector8f; -typedef Matrix Vector12f; - -typedef Matrix Vector5d; -typedef Matrix Vector6d; -typedef Matrix Vector7d; -typedef Matrix Vector8d; -typedef Matrix Vector9d; -typedef Matrix Vector10d; -typedef Matrix Vector12d; - -struct TestNew1 -{ - MatrixXd m; // good: m will allocate its own array, taking care of alignment. - TestNew1() : m(20,20) {} -}; - -struct TestNew2 -{ - Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned, - // 8-byte alignment is good enough here, which we'll get automatically -}; - -struct TestNew3 -{ - Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned -}; - -struct TestNew4 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - Vector2d m; - float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects -}; - -struct TestNew5 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - float f; // try the f at first -- the EIGEN_ALIGN_MAX attribute of m should make that still work - Matrix4f m; -}; - -struct TestNew6 -{ - Matrix m; // good: no alignment requested - float f; -}; - -template struct Depends -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(Align) - Vector2d m; - float f; -}; - -template -void check_unalignedassert_good() -{ - T *x, *y; - x = new T; - delete x; - y = new T[2]; - delete[] y; -} - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -template -void construct_at_boundary(int boundary) -{ - char buf[sizeof(T)+256]; - size_t _buf = reinterpret_cast(buf); - _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned - _buf += boundary; // make exact boundary-aligned - T *x = ::new(reinterpret_cast(_buf)) T; - x[0].setZero(); // just in order to silence warnings - x->~T(); -} -#endif - -void unalignedassert() -{ -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - construct_at_boundary(4); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - 
construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); -#endif - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good >(); - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(EIGEN_MAX_ALIGN_BYTES>=16) - { - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - // Complexes are disabled because the compiler might aggressively vectorize - // the initialization of complex coeffs to 0 before we can check for alignedness - //VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - } - for(int b=8; b(b)); - if(b<64) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - //if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - } -#endif -} - -EIGEN_DECLARE_TEST(unalignedassert) -{ - CALL_SUBTEST(unalignedassert()); -} diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 7a853888b4117d9663b96093ae47a67f893b4f6b..19375191d5de4a6e172e30d6e5a891965fc48121 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -39,11 +39,15 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling) { EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src); typedef internal::copy_using_evaluator_traits,internal::evaluator, internal::assign_op > traits; - bool res = traits::Traversal==traversal; - if(unrolling==InnerUnrolling+CompleteUnrolling) - res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); - else - res = res && int(traits::Unrolling)==unrolling; + // If traversal or unrolling are negative, ignore. + bool res = traversal > -1 ? traits::Traversal==traversal : true; + if (unrolling > -1) { + if(unrolling==InnerUnrolling+CompleteUnrolling) { + res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); + } else { + res = res && int(traits::Unrolling)==unrolling; + } + } if(!res) { std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; @@ -159,11 +163,11 @@ struct vectorization_logic EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(), - (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, + (int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) + EIGEN_UNALIGNED_VECTORIZE ? 
((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), @@ -178,21 +182,15 @@ struct vectorization_logic typedef Matrix Vector3; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector3(),Vector3()+Vector3(), - sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), CompleteUnrolling)); - VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) - : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); + // Vectorization depends on too many factors - ignore. + VERIFY(test_assign(Vector3(),Vector3()+Vector3(), -1, CompleteUnrolling)); VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); + // Vectorization depends on too many factors - ignore. VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - sizeof(Scalar)==16 ? InnerVectorizedTraversal : - EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : - LinearTraversal, - NoUnrolling)); + -1, NoUnrolling)); VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling)); @@ -277,12 +275,20 @@ struct vectorization_logic_half }; static void run() { + // Some half-packets have a byte size < EIGEN_MIN_ALIGN_BYTES (e.g. Packet2f), + // which causes many of these tests to fail since they don't vectorize if + // EIGEN_UNALIGNED_VECTORIZE is 0 (the matrix is assumed unaligned). + // Adjust the matrix sizes to account for these alignment issues. + enum { PacketBytes = sizeof(Scalar)*PacketSize }; + enum { MinVSize = EIGEN_UNALIGNED_VECTORIZE ? int(PacketSize) + : int(PacketBytes) >= EIGEN_MIN_ALIGN_BYTES ? int(PacketSize) + : (EIGEN_MIN_ALIGN_BYTES + sizeof(Scalar) - 1) / sizeof(Scalar) }; - typedef Matrix Vector1; - typedef Matrix Matrix11; - typedef Matrix Matrix57; - typedef Matrix Matrix35; - typedef Matrix Matrix57u; + typedef Matrix Vector1; + typedef Matrix Matrix11; + typedef Matrix Matrix57; + typedef Matrix Matrix35; + typedef Matrix Matrix57u; typedef Matrix Matrix3; - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT +#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT VERIFY(test_assign(Vector1(),Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1()+Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector1(),Vector1().template segment(0).derived(), + VERIFY(test_assign(Vector1(),Vector1().template segment(0).derived(), EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Scalar(2.1)*Vector1()-Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment(0)-Vector1().template segment(0)).derived(), + VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment(0)-Vector1().template segment(0)).derived(), EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()), InnerVectorizedTraversal,CompleteUnrolling)); @@ -324,26 +330,23 @@ struct vectorization_logic_half EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); if(PacketSize>1) { typedef Matrix Matrix33c; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); - VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) - : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); - + + // Unrolling depends on read costs and unroll limits, which vary - ignore. VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), - PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); + PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal, -1)); VERIFY(test_assign(Matrix(),Matrix()+Matrix(), sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), NoUnrolling)); - VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling)));
@@ -357,7 +360,7 @@ struct vectorization_logic_half
     VERIFY(test_redux(Vector1(),
       LinearVectorizedTraversal,CompleteUnrolling));
 
-    VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+    VERIFY(test_redux(Matrix<Scalar,MinVSize,3>(),
       LinearVectorizedTraversal,CompleteUnrolling));
 
     VERIFY(test_redux(Matrix3(),
@@ -379,9 +382,9 @@ struct vectorization_logic_half
          Matrix<Scalar,4,4> >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)));
 
-    VERIFY((test_assign(Matrix57(), Matrix<Scalar,5,3>()*Matrix<Scalar,3,7>(),
-      InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling)));
-    #endif
+    VERIFY((test_assign(Matrix57(), Matrix<Scalar,5,3>() * Matrix<Scalar,3,7>(),
+      InnerVectorizedTraversal, InnerUnrolling + CompleteUnrolling)));
+#endif
   }
 };
diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt
index 34408c017004409072f71b860477790cb767dadf..67d1f6262a494fca6d222498e88430d20df4becc 100644
--- a/unsupported/CMakeLists.txt
+++ b/unsupported/CMakeLists.txt
@@ -2,7 +2,7 @@ add_subdirectory(Eigen)
 if(EIGEN_BUILD_DOC)
   add_subdirectory(doc EXCLUDE_FROM_ALL)
 endif()
-if(BUILD_TESTING)
+if(EIGEN_BUILD_TESTING)
   if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
     add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
   else()
diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
index 7a4ff460ce505d1801af0ef383525b1beb5bb628..e10875e71cf212b31156cbd3fb9b1facc6656c0f 100644
--- a/unsupported/Eigen/AutoDiff
+++ b/unsupported/Eigen/AutoDiff
@@ -10,6 +10,8 @@
 #ifndef EIGEN_AUTODIFF_MODULE
 #define EIGEN_AUTODIFF_MODULE
 
+#include "../../Eigen/Core"
+
 namespace Eigen {
 
 /**
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index d73c6008d92f2a91e603b8d1e92f8cf2ec7116ea..0938bb554da43256277e42971b355a242f96cf4f 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -41,14 +41,6 @@
 #include <random>
 #include <thread>
 
-#ifdef _WIN32
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
-
 #if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
 #include "ThreadPool"
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 9b6f14204e06064174aaf3f3520796ed7c9f287e..2f65b1b0e9f60a4c3ac6baabdeb68d6d15fd1ca6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -3,8 +3,6 @@
 Tensors are multidimensional arrays of elements. Elements are typically
 scalars, but more complex types such as strings are also supported.
 
-[TOC]
-
 ## Tensor Classes
 
 You can manipulate a tensor with one of the following classes.  They all are in
@@ -21,7 +19,7 @@ matrix.
 Tensors of this class are resizable.  For example, if you assign a tensor of a
 different size to a Tensor, that tensor is resized to match its new value.
 
-#### Constructor `Tensor<data_type, rank>(size0, size1, ...)`
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
 
 Constructor for a Tensor.  The constructor must be passed `rank` integers
 indicating the sizes of the instance along each of the `rank`
 dimensions.
 
     // Resize t_3d by assigning a tensor of different sizes, but same rank.
     t_3d = Tensor<float, 3>(3, 4, 3);
 
-#### Constructor `Tensor<data_type, rank>(size_array)`
+#### Constructor Tensor<data_type, rank>(size_array)
 
 Constructor where the sizes for the constructor are specified as an array of
 values instead of an explicit list of parameters.  The array type to use is
@@ -45,7 +43,7 @@ from an initializer list.
    Tensor<string, 2> t_2d({5, 7});
 
-### Class `TensorFixedSize<data_type, Sizes<size0, size1, ...>>`
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
 
 Class to use for tensors of fixed size, where the size is known at compile
 time.  Fixed sized tensors can provide very fast computations because all their
@@ -57,7 +55,7 @@ tensor data is held onto the stack and does not cause heap allocation and free.
     // Create a 4 x 3 tensor of floats.
     TensorFixedSize<float, Sizes<4, 3>> t_4x3;
 
-### Class `TensorMap<Tensor<data_type, rank>>`
+### Class TensorMap<Tensor<data_type, rank>>
 
 This is the class to use to create a tensor on top of memory allocated and
 owned by another part of your code.  It allows you to view any piece of allocated
@@ -67,7 +65,7 @@ data are stored.
 A TensorMap is not resizable because it does not own the memory where its data
 are stored.
 
-#### Constructor `TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)`
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
 
 Constructor for a Tensor.  The constructor must be passed a pointer to the
 storage for the data, and "rank" size attributes.  The storage has to be
@@ -87,13 +85,13 @@ large enough to hold all the data.
     TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
 
-#### Class `TensorRef`
+#### Class TensorRef
 
 See Assigning to a TensorRef below.
 
 ## Accessing Tensor Elements
 
-#### `<data_type> tensor(index0, index1...)`
+#### <data_type> tensor(index0, index1...)
 
 Return the element at position `(index0, index1...)` in tensor
 `tensor`.  You must pass as many parameters as the rank of `tensor`.
@@ -278,7 +276,7 @@ Similarly, assigning an expression to a TensorMap causes its evaluation.  Like
 tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
 have the rank and sizes of the expression that are assigned to them.
 
-#### Calling `eval()`.
+#### Calling eval().
 
 When you compute large composite expressions, you sometimes want to tell Eigen
 that an intermediate value in the expression tree is worth evaluating ahead of
@@ -355,7 +353,7 @@ call for the right hand side:
     (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
 
-#### Assigning to a `TensorRef`.
+#### Assigning to a TensorRef.
 
 If you need to access only a few elements from the value of an expression you
 can avoid materializing the value in a full tensor by using a TensorRef.
@@ -455,24 +453,24 @@ memory for tensors with cuda.
 In the documentation of the tensor methods and Operation we mention datatypes
 that are tensor-type specific:
 
-#### `<Tensor-Type>::``Dimensions`
+#### <Tensor-Type>::Dimensions
 
 Acts like an array of ints.  Has an `int size` attribute, and can be
 indexed like an array to access individual values.  Used to represent the
 dimensions of a tensor.  See `dimensions()`.
 
-#### `<Tensor-Type>::``Index`
+#### <Tensor-Type>::Index
 
 Acts like an `int`.  Used for indexing tensors along their dimensions.  See
 `operator()`, `dimension()`, and `size()`.
 
-#### `<Tensor-Type>::``Scalar`
+#### <Tensor-Type>::Scalar
 
 Represents the datatype of individual tensor elements.  For example, for a
 `Tensor<float, 3>`, `Scalar` is the type `float`.  See `setConstant()`.
 
-#### `<Operation>`
+#### <Operation>
 
 We use this pseudo type to indicate that a tensor Operation is returned by a
 method.  We indicate in the text the type and dimensions of the tensor that the
@@ -492,7 +490,7 @@ Tensor, TensorFixedSize, and TensorMap.
 
 ## Metadata
 
-### `int NumDimensions`
+### int NumDimensions
 
 Constant value indicating the number of dimensions of a Tensor.  This is also
 known as the tensor "rank".
 
     cout << "Dims " << a.NumDimensions;
     => Dims 2
 
-### `Dimensions dimensions()`
+### Dimensions dimensions()
 
 Returns an array-like object representing the dimensions of the tensor.
 The actual type of the `dimensions()` result is `<Tensor-Type>::Dimensions`.
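A short illustration of these metadata accessors, under the assumption of a rank-3 float tensor (sizes chosen for the example):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  // NumDimensions is the static rank; dimensions() gives per-axis sizes.
  std::cout << "rank: " << t.NumDimensions << "\n";             // 3
  const auto& d = t.dimensions();
  std::cout << "dims: " << d[0] << "x" << d[1] << "x" << d[2]   // 2x3x4
            << ", size: " << t.size() << "\n";                  // 24
  return 0;
}
```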
@@ -519,7 +517,7 @@ If you use a C++11 compiler, you can use `auto` to simplify the code:
          << ", dim 1: " << d[1];
     => Dim size: 2, dim 0: 3, dim 1: 4
 
-### `Index dimension(Index n)`
+### Index dimension(Index n)
 
 Returns the n-th dimension of the tensor.  The actual type of the
 `dimension()` result is `<Tensor-Type>::Index`, but you can
@@ -530,7 +528,7 @@ always use it like an int.
     cout << "Dim 1: " << dim1;
     => Dim 1: 4
 
-### `Index size()`
+### Index size()
 
 Returns the total number of elements in the tensor.  This is the product of all
 the tensor dimensions.  The actual type of the `size()` result is
@@ -605,7 +603,7 @@ You can use one of the methods below to initialize the tensor memory.  These
 have an immediate effect on the tensor and return the tensor itself as a
 result.  These are not tensor Operations which delay evaluation.
 
-### `<Tensor-Type> setConstant(const Scalar& val)`
+### <Tensor-Type> setConstant(const Scalar& val)
 
 Sets all elements of the tensor to the constant value `val`.  `Scalar` is the
 type of data stored in the tensor.  You can pass any value that is
@@ -633,7 +631,7 @@ has a copy constructor and an `operator=()`:
     yolo yolo yolo
     yolo yolo yolo
 
-### `<Tensor-Type> setZero()`
+### <Tensor-Type> setZero()
 
 Fills the tensor with zeros.  Equivalent to `setConstant(Scalar(0))`.
 Returns the tensor itself in case you want to chain another call.
@@ -647,7 +645,7 @@ Returns the tensor itself in case you want to chain another call.
     0 0 0 0
 
-### `<Tensor-Type> setValues({..initializer_list})`
+### <Tensor-Type> setValues({..initializer_list})
 
 Fills the tensor with explicit values specified in a std::initializer_list.
 The type of the initializer list depends on the type and rank of the tensor.
@@ -683,7 +681,7 @@ code only sets the values of the first row of the tensor.
     10 20 30
     1000 1000 1000
 
-### `<Tensor-Type> setRandom()`
+### <Tensor-Type> setRandom()
 
 Fills the tensor with random values.  Returns the tensor itself in case you
 want to chain another call.
@@ -750,7 +748,7 @@ values of a tensor expression, the expression must either be evaluated or
 wrapped in a TensorRef.
 
-### `Scalar* data()` and `const Scalar* data() const`
+### Scalar* data() and const Scalar* data() const
 
 Returns a pointer to the storage for the tensor.  The pointer is const if the
 tensor was const.  This allows direct access to the data.  The layout of the
@@ -778,7 +776,7 @@ The chain of Operation is evaluated lazily, typically when it is assigned to a
 tensor.  See "Controlling when Expression are Evaluated" for more details about
 their evaluation.
 
-### `<Operation> constant(const Scalar& val)`
+### <Operation> constant(const Scalar& val)
 
 Returns a tensor of the same type and dimensions as the original tensor but
 where all elements have the value `val`.
@@ -806,7 +804,7 @@ tensor, or multiply every element of a tensor by a scalar.
     0.6 0.6 0.6
     0.6 0.6 0.6
 
-### `<Operation> random()`
+### <Operation> random()
 
 Returns a tensor of the same type and dimensions as the current tensor
 but where all elements have random values.
@@ -836,7 +834,7 @@ All these operations take a single input tensor as argument and return a tensor
 of the same type and dimensions as the tensor to which they are applied.  The
 requested operations are applied to each element independently.
 
-### `<Operation> operator-()`
+### <Operation> operator-()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the opposite values of the original tensor.
@@ -855,42 +853,42 @@ containing the opposite values of the original tensor.
     -1 -1 -1
     -1 -1 -1
 
-### `<Operation> sqrt()`
+### <Operation> sqrt()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the square roots of the original tensor.
-### `<Operation> rsqrt()`
+### <Operation> rsqrt()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the inverse square roots of the original tensor.
 
-### `<Operation> square()`
+### <Operation> square()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the squares of the original tensor values.
 
-### `<Operation> inverse()`
+### <Operation> inverse()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the inverse of the original tensor values.
 
-### `<Operation> exp()`
+### <Operation> exp()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the exponential of the original tensor.
 
-### `<Operation> log()`
+### <Operation> log()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the natural logarithms of the original tensor.
 
-### `<Operation> abs()`
+### <Operation> abs()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the absolute values of the original tensor.
 
-### `<Operation> pow(Scalar exponent)`
+### <Operation> pow(Scalar exponent)
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the coefficients of the original tensor to the power of the
@@ -917,17 +915,17 @@ cubic roots of an int Tensor:
     0 1 2
     3 4 5
 
-### `<Operation> operator * (Scalar scale)`
+### <Operation> operator * (Scalar scale)
 
 Multiplies all the coefficients of the input tensor by the provided scale.
 
-### `<Operation> cwiseMax(Scalar threshold)`
+### <Operation> cwiseMax(Scalar threshold)
 TODO
 
-### `<Operation> cwiseMin(Scalar threshold)`
+### <Operation> cwiseMin(Scalar threshold)
 TODO
 
-### `<Operation> unaryExpr(const CustomUnaryOp& func)`
+### <Operation> unaryExpr(const CustomUnaryOp& func)
 TODO
 
@@ -939,39 +937,39 @@ dimensions as the tensors to which they are applied, and unless otherwise
 specified it is also of the same type. The requested operations are applied to
 each pair of elements independently.
 
-### `<Operation> operator+(const OtherDerived& other)`
+### <Operation> operator+(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise sums of the inputs.
 
-### `<Operation> operator-(const OtherDerived& other)`
+### <Operation> operator-(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise differences of the inputs.
 
-### `<Operation> operator*(const OtherDerived& other)`
+### <Operation> operator*(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise products of the inputs.
 
-### `<Operation> operator/(const OtherDerived& other)`
+### <Operation> operator/(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise quotients of the inputs.
 
 This operator is not supported for integer types.
 
-### `<Operation> cwiseMax(const OtherDerived& other)`
+### <Operation> cwiseMax(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise maximums of the inputs.
 
-### `<Operation> cwiseMin(const OtherDerived& other)`
+### <Operation> cwiseMin(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise minimums of the inputs.
 
-### `<Operation> Logical operators`
+### <Operation> Logical operators
 
 The following logical operators are supported as well:
 
@@ -1129,50 +1127,50 @@ scalar, represented as a zero-dimension tensor.
     276
 
-### `<Operation> sum(const Dimensions& new_dims)`
-### `<Operation> sum()`
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
 
 Reduce a tensor using the sum() operator.  The resulting values
 are the sum of the reduced values.
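A brief sketch of reducing along specific dimensions (the dimension indices and sizes are illustrative):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<int, 3> t(2, 3, 4);
  t.setConstant(1);
  // Reduce over dimensions 1 and 2: the result is a rank-1 tensor of
  // size 2, each entry the sum of 3*4 = 12 ones.
  Eigen::array<int, 2> dims{{1, 2}};
  Eigen::Tensor<int, 1> s = t.sum(dims);
  std::cout << s(0) << " " << s(1) << "\n";  // 12 12
  return 0;
}
```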
-### `<Operation> mean(const Dimensions& new_dims)`
-### `<Operation> mean()`
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
 
 Reduce a tensor using the mean() operator.  The resulting values
 are the mean of the reduced values.
 
-### `<Operation> maximum(const Dimensions& new_dims)`
-### `<Operation> maximum()`
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
 
 Reduce a tensor using the maximum() operator.  The resulting values are the
 largest of the reduced values.
 
-### `<Operation> minimum(const Dimensions& new_dims)`
-### `<Operation> minimum()`
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
 
 Reduce a tensor using the minimum() operator.  The resulting values
 are the smallest of the reduced values.
 
-### `<Operation> prod(const Dimensions& new_dims)`
-### `<Operation> prod()`
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
 
 Reduce a tensor using the prod() operator.  The resulting values
 are the product of the reduced values.
 
-### `<Operation> all(const Dimensions& new_dims)`
-### `<Operation> all()`
+### <Operation> all(const Dimensions& new_dims)
+### <Operation> all()
 
 Reduce a tensor using the all() operator.  Casts tensor to bool and then checks
 whether all elements are true.  Runs through all elements rather than
 short-circuiting, so may be significantly inefficient.
 
-### `<Operation> any(const Dimensions& new_dims)`
-### `<Operation> any()`
+### <Operation> any(const Dimensions& new_dims)
+### <Operation> any()
 
 Reduce a tensor using the any() operator.  Casts tensor to bool and then checks
 whether any element is true.  Runs through all elements rather than
 short-circuiting, so may be significantly inefficient.
 
-### `<Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)`
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
 
 Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
 in TensorFunctors.h for information on how to implement a reduction operator.
@@ -1208,8 +1206,8 @@ Example: Trace along 2 dimensions.
     15
 
-### `<Operation> trace(const Dimensions& new_dims)`
-### `<Operation> trace()`
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
 
 As a special case, if no parameter is passed to the operation, trace is computed
 along *all* dimensions of the input tensor.
@@ -1259,18 +1257,18 @@
     1 3 6
     4 9 15
 
-### `<Operation> cumsum(const Index& axis)`
+### <Operation> cumsum(const Index& axis)
 
 Perform a scan by summing consecutive entries.
 
-### `<Operation> cumprod(const Index& axis)`
+### <Operation> cumprod(const Index& axis)
 
 Perform a scan by multiplying consecutive entries.
 
 ## Convolutions
 
-### `<Operation> convolve(const Kernel& kernel, const Dimensions& dims)`
+### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
 
 Returns a tensor that is the output of the convolution of the input tensor with the kernel,
 along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
@@ -1313,7 +1311,7 @@ These operations return a Tensor with different dimensions than the original
 Tensor. They can be used to access slices of tensors, see them with different
 dimensions, or pad tensors with additional data.
 
-### `<Operation> reshape(const Dimensions& new_dims)`
+### <Operation> reshape(const Dimensions& new_dims)
 
 Returns a view of the input tensor that has been reshaped to the specified
 new dimensions.  The argument new_dims is an array of Index values.  The
@@ -1392,7 +1390,7 @@ Note that "b" itself was not reshaped but that instead the assignment is done to
 the reshape view of b.
 
-### `<Operation> shuffle(const Shuffle& shuffle)`
+### <Operation> shuffle(const Shuffle& shuffle)
 
 Returns a copy of the input tensor whose dimensions have been
 reordered according to the specified permutation.
The argument shuffle @@ -1433,7 +1431,7 @@ Let's rewrite the previous example to take advantage of this feature: output.shuffle({2, 0, 1}) = input; -### ` stride(const Strides& strides)` +### stride(const Strides& strides) Returns a view of the input tensor that strides (skips stride-1 elements) along each of the dimensions. The argument strides is an @@ -1459,7 +1457,7 @@ It is possible to assign a tensor to a stride: output.stride({2, 3, 4}) = input; -### ` slice(const StartIndices& offsets, const Sizes& extents)` +### slice(const StartIndices& offsets, const Sizes& extents) Returns a sub-tensor of the given tensor. For each dimension i, the slice is made of the coefficients stored between offset[i] and offset[i] + extents[i] in @@ -1485,7 +1483,7 @@ the input tensor. 600 700 -### ` chip(const Index offset, const Index dim)` +### chip(const Index offset, const Index dim) A chip is a special kind of slice. It is the subtensor at the given offset in the dimension dim. The returned tensor has one fewer dimension than the input @@ -1536,7 +1534,7 @@ lvalue. For example: 0 0 0 -### ` reverse(const ReverseDimensions& reverse)` +### reverse(const ReverseDimensions& reverse) Returns a view of the input tensor that reverses the order of the coefficients along a subset of the dimensions. The argument reverse is an array of boolean @@ -1566,7 +1564,7 @@ of a 2D tensor: 0 100 200 -### ` broadcast(const Broadcast& broadcast)` +### broadcast(const Broadcast& broadcast) Returns a view of the input tensor in which the input is replicated one to many times. @@ -1590,11 +1588,11 @@ made in each of the dimensions. 0 100 200 0 100 200 300 400 500 300 400 500 -### ` concatenate(const OtherDerived& other, Axis axis)` +### concatenate(const OtherDerived& other, Axis axis) TODO -### ` pad(const PaddingDimensions& padding)` +### pad(const PaddingDimensions& padding) Returns a view of the input tensor in which the input is padded with zeros. @@ -1619,7 +1617,7 @@ Returns a view of the input tensor in which the input is padded with zeros. 0 0 0 0 -### ` extract_patches(const PatchDims& patch_dims)` +### extract_patches(const PatchDims& patch_dims) Returns a tensor of coefficient patches extracted from the input tensor, where each patch is of dimension specified by 'patch_dims'. The returned tensor has @@ -1706,7 +1704,7 @@ This code results in the following output when the data layout is RowMajor: 6 7 10 11 -### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) Returns a tensor of coefficient image patches extracted from the input tensor, which is expected to have dimensions ordered as follows (depending on the data @@ -1763,7 +1761,7 @@ sizes: ## Special Operations -### ` cast()` +### cast() Returns a tensor of type T with the same dimensions as the original tensor. 
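The morphing operations above compose like any other tensor expression; a minimal sketch combining slice and chip (shapes assumed for illustration):

    Eigen::Tensor<float, 3> t(4, 5, 6);
    t.setRandom();
    // Keep positions 1..2 of the first dimension, then chip away the
    // third dimension at offset 0, leaving a 2x5 tensor.
    Eigen::array<int, 3> offsets = {1, 0, 0};
    Eigen::array<int, 3> extents = {2, 5, 6};
    Eigen::Tensor<float, 2> s = t.slice(offsets, extents).chip(0, 2);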
The returned tensor contains the values of the original tensor converted to @@ -1792,7 +1790,7 @@ but you can easily cast the tensors to floats to do the division: 1 2 2 -### ` eval()` +### eval() TODO diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 91a6f8d6c32ef763f7232ab681f501e3a0c9a7c3..8b8fb923523d6c42df61f5b9ce7e6fc877fd4094 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -99,18 +99,18 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -240,7 +240,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), m_return_dim(op.return_dim()) @@ -263,11 +263,11 @@ struct TensorEvaluator, Devi return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 72f072cf2e4df90f49920908c845c11f665e5d99..e5811d63fa63b7fd98ef94bbcfd4f0335040e87e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -104,14 +104,14 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), Layout = TensorEvaluator::Layout, RawAccess = TensorEvaluator::RawAccess }; @@ -124,7 +124,7 @@ 
struct TensorEvaluator, Device> RightTensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -154,7 +154,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync( @@ -163,7 +163,7 @@ struct TensorEvaluator, Device> } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 35b6458e5a813518d0ba0bcc7d3949d931914b2d..fcc7411afcaedacecc26f875bbd55db3d87bc9f5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -513,34 +513,34 @@ class TensorBase // Comparisons and tests. 
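// The comparison operators in the hunk below now take their argument as a
// TensorBase expression rather than an unconstrained OtherDerived, so they
// participate in overload resolution only when the right-hand side really
// is a tensor expression. A hedged usage sketch (types and values assumed
// for illustration):
//
//   Eigen::Tensor<float, 2> a(2, 2), b(2, 2);
//   a.setRandom(); b.setRandom();
//   Eigen::Tensor<bool, 2> mask = a < b;  // coefficient-wise comparison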
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { + operator<(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { + operator<=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { + operator>(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { + operator>=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { + operator==(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { + operator!=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 1e55d12c42fc2ece4035b743915eef10d996ea0a..243b3fb7bd68bc1896ef92da191b24b47460f6fa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -242,7 +242,7 @@ class TensorBlockDescriptor { const DestinationBufferKind& kind() const { return m_kind; } private: - friend class TensorBlockDescriptor; + friend class TensorBlockDescriptor; DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} @@ -706,7 +706,7 @@ class TensorMaterializedBlock { } private: - friend class TensorMaterializedBlock; + friend class TensorMaterializedBlock; Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index fc75c8d9a6fcd1d5835b8e1668b021d97387460a..7449b046b2936bab7891a9299759fe800016eeb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> typedef DSizes BroadcastDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; typedef typename TensorEvaluator::TensorBlock @@ -138,14 +138,13 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : isCopy(false), nByOne(false), 
oneByN(false), m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar - // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // and store the result in a scalar. Instead one should reshape the scalar into a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); @@ -211,20 +210,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -411,25 +410,24 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Concatenates m_broadcast[dim] copies, + // [v0, ..., vN, v0, ..., vN, ... ] + // with dim == NumDims - 1 for col-major, dim == 0 for row-major. EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - Index dim, inputIndex; - - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = NumDims - 1; - } else { - dim = 0; - } - - inputIndex = index % m_inputStrides[dim]; - if (inputIndex + PacketSize <= m_inputStrides[dim]) { + // Size of flattened tensor. + const Index M = (static_cast(Layout) == static_cast(ColMajor)) ? + m_inputStrides[NumDims - 1] : m_inputStrides[0]; + Index inputIndex = index % M; + if (inputIndex + PacketSize <= M) { return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { - if (inputIndex > m_inputStrides[dim]-1) { + if (inputIndex > M - 1) { inputIndex = 0; } values[i] = m_impl.coeff(inputIndex++); @@ -441,32 +439,30 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Interleaves m_broadcast[dim] copies, + // [v0, v0, ..., v1, v1, ..., vN, vN, ... ] + // with dim == 0 for col-major, dim == NumDims - 1 for row-major. EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - Index dim, inputIndex, outputOffset; + eigen_assert(index + PacketSize-1 < dimensions().TotalSize()); - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = 1; - } else { - dim = NumDims - 2; - } + const Index M = (static_cast(Layout) == static_cast(ColMajor)) ? 
+ m_broadcast[0] : m_broadcast[NumDims - 1]; - inputIndex = index / m_outputStrides[dim]; - outputOffset = index % m_outputStrides[dim]; - if (outputOffset + PacketSize <= m_outputStrides[dim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1(values); + Index inputIndex = index / M; + Index outputOffset = index % M; + if (outputOffset + PacketSize <= M) { + return internal::pset1(m_impl.coeff(inputIndex)); } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[dim]) { + for (int i = 0; i < PacketSize; ++i) { + if (outputOffset < M) { values[i] = m_impl.coeff(inputIndex); + ++outputOffset; } else { values[i] = m_impl.coeff(++inputIndex); - outputOffset = 0; - cur = 0; + outputOffset = 1; // Next offset. } } return internal::pload(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7c6bbd180fba752e19db27b516be652d711d7ca2..376457341120f2224b652d796942c80a293ecfa9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -164,7 +164,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -200,12 +200,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -433,7 +433,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 0dfe21604290de306e0f618975cb5deb2cabcf7a..5235a8e6f93f9d0571ba1acbe7e7e28990b9e5d4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -119,7 +119,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -172,14 +172,14 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 424caced1a2a6eefe0bb6bca23304bdfb8518f4b..fa36da195f38fcc8621e55246734dc2bc8c6ddf6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -89,7 
+89,6 @@ struct TensorContractionBlockMemAllocator { eigen_assert(rhs_block); BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); - eigen_assert(block_mem); *lhs_block = reinterpret_cast(block_mem); *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); return block_mem; @@ -417,7 +416,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator typedef DSizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), @@ -602,7 +601,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -617,7 +616,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { @@ -633,6 +632,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } #endif // EIGEN_USE_THREADS +#ifndef TENSOR_CONTRACTION_DISPATCH #define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ if (this->m_lhs_inner_dim_contiguous) { \ if (this->m_rhs_inner_dim_contiguous) { \ @@ -663,7 +663,9 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } \ } \ } +#endif +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH #define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ if (this->m_lhs_inner_dim_contiguous) { \ if (this->m_rhs_inner_dim_contiguous) { \ @@ -694,6 +696,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } \ } \ } +#endif EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { static_cast(this)->template evalProduct(buffer); @@ -908,7 +911,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator kernel.deallocate(this->m_device, packed_mem); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -1005,7 +1008,7 @@ struct TensorEvaluator Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index bb990b3782c33b2eea4870876ccda8529e8ff848..bf9194d88b76c80ec29771a3a57e2aed7f030245 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -233,7 +233,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } \ } \ -#define writeRegToShmem(_) \ +#define writeRegToShmem() \ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ rhs_shmem[rhs_store_idx_0] = 
rhs_pf0; \ \ @@ -1270,7 +1270,7 @@ struct TensorEvaluator::value), @@ -1278,7 +1278,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index a6ca1777a160579f431099b782348f2fcdb90259..473c228490f238f1a6c1bb6f0d31b6ae8f93b557 100755 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -1340,10 +1340,10 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (!data) { @@ -1630,7 +1630,7 @@ struct TensorEvaluatorm_leftImpl.cleanup(); this->m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 44493906d0d81bd011a7663f77bf87047d3ce963..09d2da9a8d2ed240aa1c9bc9ba45bb54d2d7fabd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,14 +195,14 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -211,8 +211,7 @@ template struct ConversionSubExprEval< template struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); } }; @@ -221,8 +220,7 @@ template struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { impl.evalSubExprsIfNeededAsync(data, std::move(done)); } }; @@ -363,21 +361,21 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType data, EvalSubExprsCallback done) { ConversionSubExprEvalAsync, EvaluatorPointerType, @@ -385,7 +383,7 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + EIGEN_STRONG_INLINE void 
cleanup() { m_impl.cleanup(); } @@ -404,8 +402,8 @@ struct TensorEvaluator, Device> const bool Vectorizable = IsSameType ? TensorEvaluator::PacketAccess - : TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); return internal::PacketConv::run(m_impl, index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index df289e2c0ad90e7e063a431065141708532e60f4..b20f80ba2a07166413e4e05d90c9995086dd34f1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -307,8 +307,8 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + IsAligned = int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & int(TensorEvaluator::PacketAccess), BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -320,7 +320,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -384,12 +384,12 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 92003c76653130fcfbd015eceeb161570401cdc5..033318fdcc86e1e492f1c3277147f9de919e681e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -305,7 +305,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.expression()); @@ -114,7 +114,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { evalTo(data); return false; @@ -126,7 +126,7 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_result) { m_device.deallocate_temp(m_result); m_result = NULL; @@ -157,7 +157,7 @@ struct TensorEvaluator, Devi #endif protected: - EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + void evalTo(EvaluatorPointerType data) { TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } @@ -279,7 +279,7 @@ struct TensorEvaluator > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 9422dcd7af7820b6012ca2a4bd65b92d2828d7ee..ec2e3cb143aaaea2b2ef996109cb8cdf25f902c1 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -42,51 +42,84 @@ class StreamInterface { virtual unsigned int* semaphore() const = 0; }; -static gpuDeviceProp_t* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. - static std::atomic first(true); - if (first.exchange(false)) { - // We're the first thread to reach this point. - int num_devices; - gpuError_t status = gpuGetDeviceCount(&num_devices); - if (status != gpuSuccess) { - std::cerr << "Failed to get the number of GPU devices: " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - m_deviceProperties = new gpuDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&m_deviceProperties[i], i); +class GpuDeviceProperties { + public: + GpuDeviceProperties() : + initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { + return device_properties_[device]; + } + + EIGEN_STRONG_INLINE bool isInitialized() const { + return initialized_; + } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); if (status != gpuSuccess) { - std::cerr << "Failed to initialize GPU device #" - << i - << ": " + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; gpu_assert(status == gpuSuccess); } - } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" + << i + << ": " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + } - std::atomic_thread_fence(std::memory_order_release); - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { - std::atomic_thread_fence(std::memory_order_acquire); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. 
+ while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } } } } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); } static const gpuStream_t default_stream = gpuStreamDefault; @@ -96,12 +129,9 @@ class GpuStreamDevice : public StreamInterface { // Use the default stream on the current device GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { gpuGetDevice(&device_); - initializeDeviceProp(); } // Use the default stream on the specified device - GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} // Use the specified stream. Note that it's the // caller responsibility to ensure that the stream can run on // the specified device. If no device is specified the code @@ -118,7 +148,6 @@ class GpuStreamDevice : public StreamInterface { gpu_assert(device < num_devices); device_ = device; } - initializeDeviceProp(); } virtual ~GpuStreamDevice() { @@ -129,7 +158,7 @@ class GpuStreamDevice : public StreamInterface { const gpuStream_t& stream() const { return *stream_; } const gpuDeviceProp_t& deviceProperties() const { - return m_deviceProperties[device_]; + return GetGpuDeviceProperties(device_); } virtual void* allocate(size_t num_bytes) const { gpuError_t err = gpuSetDevice(device_); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 4689b0230911f6e4bdabe56ecfa1d48435cd7ec4..a48d035f5aad48f0560e613b8d361f25d1075e30 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -131,17 +131,17 @@ struct TensorEvaluator, Device> TensorBlockAssignment; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + EIGEN_STRONG_INLINE ~TensorEvaluator() { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); @@ -149,7 +149,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType scalar, EvalSubExprsCallback done) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); @@ -191,7 +191,7 @@ 
struct TensorEvaluator, Device> block.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d4532b72c82d6771bf2f3c034076582bb60196c0..6ac575ef59fd9adb39f4b4dec9d5e5cd2d183b34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -63,7 +63,7 @@ struct TensorEvaluator TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get((const_cast(m.data())))), m_dims(m.dimensions()), m_device(device) @@ -72,7 +72,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { if (!NumTraits::type>::RequireInitialization && dest) { m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; @@ -82,14 +82,14 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. done(evalSubExprsIfNeeded(dest)); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {} + EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); @@ -192,7 +192,7 @@ struct TensorEvaluator const Device EIGEN_DEVICE_REF m_device; }; -namespace { +namespace internal { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) { return *address; @@ -219,8 +219,7 @@ T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address return *address; } #endif -} - +} // namespace internal // Default evaluator for rvalues template @@ -262,13 +261,13 @@ struct TensorEvaluator TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits::type>::RequireInitialization && data) { m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; @@ -278,18 +277,18 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. 
done(evalSubExprsIfNeeded(dest)); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); - return loadConstant(m_data+index); + return internal::loadConstant(m_data+index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -314,7 +313,7 @@ struct TensorEvaluator eigen_assert(m_data != NULL); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); + return internal::loadConstant(m_data+index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -357,7 +356,6 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } @@ -391,17 +389,17 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { done(true); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -446,8 +444,8 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -455,7 +453,7 @@ struct TensorEvaluator, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) @@ -485,20 +483,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -557,21 +555,21 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - 
internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), @@ -613,7 +611,7 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { // TODO(ezhulenev): Evaluate two expression in parallel? m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { @@ -631,7 +629,7 @@ struct TensorEvaluator RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) @@ -886,7 +884,7 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -895,7 +893,7 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { @@ -905,7 +903,7 @@ struct TensorEvaluator } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); m_elseImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index c62bc5fa90e3c4bb133ad675bc6a54ece797c9a2..4a1a0687cabbb915d3987bedcfd94149e296005d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -144,7 +144,7 @@ struct TensorEvaluator, D typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { 
eigen_assert(input_dims[i] > 0); @@ -169,7 +169,7 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); @@ -181,7 +181,7 @@ struct TensorEvaluator, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); m_data = NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index ca39bb855217e287abeacbf0333c671441bb8046..c7c1cfc7228cc334558b81485820bf6ac714b7f0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -61,7 +61,7 @@ class TensorFixedSize : public TensorBase, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); @@ -165,7 +162,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( @@ -185,7 +182,7 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_device.deallocate_temp(m_buffer); m_buffer = NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index fd8fa00fa6292be32273241155e035858a666c9f..d9630322435fdd2cd75cc4e67ccf97a44088404d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -365,12 +365,16 @@ struct reducer_traits { }; }; - -// Argmin/Argmax reducers +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. 
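// As a concrete illustration of the first-occurrence rule documented above
// (tuple values assumed): reducing {(0, 7), (1, 9), (2, 9)} with
// ArgMaxTupleReducer keeps (1, 9). The later (2, 9) does not replace the
// accumulator because its value is not strictly greater and its index is
// not smaller, so ties resolve to the earliest index.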
template struct ArgMaxTupleReducer { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first ) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::lowest()); @@ -394,7 +398,11 @@ struct reducer_traits, Device> { template struct ArgMinTupleReducer { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::highest()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index b1ff1d8b1b6276d453096b66a7314f0dd1bfbb02..174bf06838c008215a312537a03f3c94870db836 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -111,7 +111,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_generator(op.generator()) { TensorEvaluator argImpl(op.expression(), device); @@ -136,10 +136,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h index db394bcbb6349ab229e8526f43e06d3d5931a550..1d142f2ee8790c87590ca235c7fecb5592366da1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -10,6 +10,8 @@ #if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #undef gpuStream_t #undef gpuDeviceProp_t #undef gpuError_t @@ -35,6 +37,8 @@ #undef gpuDeviceSynchronize #undef gpuMemcpy +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H #endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 49d1004f3d82d80d597dbdca2ba6ba5ffd57b758..dd51850b7760f42713ad5a7ac3b856362d5ac5e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -242,7 +242,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + EIGEN_STRONG_INLINE 
TensorEvaluator( const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -389,20 +389,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -514,16 +514,16 @@ struct TensorEvaluator, Device> } #endif - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 2d8c7b9033f22a48b3e7192c31216396fa270dd0..d3600eab35458acf586e7beecc1d66b87e25d744 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -246,7 +246,7 @@ struct tuple_coeff { template EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { - return ((i == Idx) & is_compile_time_constant::ValType>::value) || + return ((i == Idx) && is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } @@ -468,7 +468,7 @@ struct index_statically_eq_impl { template struct index_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) 
{ - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) == value); } }; @@ -476,7 +476,7 @@ struct index_statically_eq_impl > { template struct index_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) == value); } }; @@ -492,7 +492,7 @@ struct index_statically_ne_impl { template struct index_statically_ne_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) != value); } }; @@ -500,7 +500,7 @@ struct index_statically_ne_impl > { template struct index_statically_ne_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) != value); } }; @@ -516,7 +516,7 @@ struct index_statically_gt_impl { template struct index_statically_gt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) > value); } }; @@ -524,7 +524,7 @@ struct index_statically_gt_impl > { template struct index_statically_gt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) > value); } }; @@ -541,7 +541,7 @@ struct index_statically_lt_impl { template struct index_statically_lt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) < value); } }; @@ -549,7 +549,7 @@ struct index_statically_lt_impl > { template struct index_statically_lt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) < value); } }; @@ -566,7 +566,7 @@ struct index_pair_first_statically_eq_impl { template struct index_pair_first_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).first == value); } }; @@ -574,7 +574,7 @@ struct index_pair_first_statically_eq_impl struct index_pair_first_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).first == value); } }; @@ -591,7 +591,7 @@ struct index_pair_second_statically_eq_impl { template struct index_pair_second_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).second == value); } }; @@ -599,7 +599,7 @@ struct index_pair_second_statically_eq_impl struct index_pair_second_statically_eq_impl > { EIGEN_DEVICE_FUNC static 
constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).second == value); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 7dadec7fbe7ec11814cb38d71be80d1e28db1efe..c5cb61af53468fd38def4498dab4cab50ae07488 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); @@ -137,11 +137,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 6d5cce4aa6fe65c7507309e3278da65a04ec93a1..74fdc4c3c7722b2ad5802b1ca62b4d2922a55e80 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -28,8 +28,6 @@ namespace Eigen { namespace internal { -namespace { - // Note: result is undefined if val == 0 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE @@ -135,8 +133,6 @@ namespace { #endif } }; -} - template struct TensorIntDivisor { @@ -252,7 +248,7 @@ private: template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f159db1b9612c6b9bebee89b42316b5e4745eea6..80106c1a0093dcfaf8b98f8e67ee4738c4c203da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -113,7 +113,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { for(int i = 0; i < NumDims; ++i) { @@ -136,10 +136,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -191,7 +191,7 @@ template typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index a6181d35e6834f5d1010e2450d5eb04c865a2e3a..75b919839e4a401a1e07fb53df2a2a0da12a1b5a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> @@ -261,10 +261,10 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx, numeric_list) { - return { idx[Is]... }; + array customIndices2Array(IndexType& idx, numeric_list) { + return { static_cast(idx[First]), static_cast(idx[Is])... }; } template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index ef79c8567e2fb4d4e1360297ce2d0c17b647f244..57da2e18d7a443d3d665b2bfd555d929b899e32c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size @@ -154,16 +154,16 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType data, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -255,7 +255,7 @@ template RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -369,8 +369,9 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { @@ -400,7 +401,7 @@ 
template struct MemcpyTriggerForSlicing @@ -443,15 +444,10 @@ struct TensorEvaluator, Devi TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (Index i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - m_is_identity = true; - bool degenerate = false; for (int i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); @@ -459,9 +455,6 @@ struct TensorEvaluator, Devi op.startIndices()[i] != 0) { m_is_identity = false; } - if (op.sizes()[i] == 0) { // we have an empty size - degenerate = true; - } } // No strides for scalars. @@ -479,8 +472,8 @@ struct TensorEvaluator, Devi m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); } + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } } else { m_inputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { @@ -491,14 +484,14 @@ struct TensorEvaluator, Devi m_outputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); } + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { @@ -519,7 +512,7 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
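// ---------------------------------------------------------------------------
// Note on the memcpy fast path set up above: when a slice keeps whole inner
// dimensions intact, 'contiguous_values' coefficients at a time can be copied
// with memcpy instead of evaluated per coefficient. A minimal standalone
// sketch of the trigger logic (the 'Trigger' below is hypothetical and
// simplified from MemcpyTriggerForSlicing; the threshold mirrors the
// 2 * numThreads() constructor seen in the hunks above):
#include <cstring>

struct Trigger {
  long threshold;  // e.g. 2 * device.numThreads() on the threaded device
  bool operator()(long contiguous) const { return contiguous > threshold; }
};

int main() {
  float src[24], dst[12];
  for (int i = 0; i < 24; ++i) src[i] = float(i);
  const long contiguous = 12;  // slice keeps the inner 3x4 block whole
  if (Trigger{4}(contiguous))  // chunk is large enough: bulk copy wins
    std::memcpy(dst, src, contiguous * sizeof(float));
  return dst[11] == 11.f ? 0 : 1;
}
// ---------------------------------------------------------------------------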
- const MemcpyTriggerForSlicing trigger(m_device); + const internal::MemcpyTriggerForSlicing trigger(m_device); if (trigger(internal::array_prod(dimensions()), contiguous_values)) { EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { @@ -534,13 +527,13 @@ struct TensorEvaluator, Devi #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -738,7 +731,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockScratchAllocator TensorBlockScratch; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -906,7 +899,7 @@ struct TensorEvaluator::Dimensions InputDimensions; const InputDimensions& input_dims = m_impl.dimensions(); - // check for degenerate intervals and compute output tensor shape - bool degenerate = false; + // compute output tensor shape m_is_identity = true; for (int i = 0; i < NumDims; i++) { Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { m_dimensions[i] = 0; - degenerate = true; } else { m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); @@ -967,8 +958,7 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = m_strides[NumDims-1]; @@ -983,8 +973,7 @@ struct TensorEvaluator= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } } @@ -992,12 +981,12 @@ struct TensorEvaluator, Device TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. 
Directly padding a scalar would lead @@ -151,20 +151,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 64a436e50740296e22b5592a7104c00ad983c0a9..413d25dd4da57c6ee7c313e13d7ea92a82bd168d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -107,7 +107,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { Index num_patches = 1; @@ -152,12 +152,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 13450e1a795ee35a65f38f627eee18bd62f81dbd..2bcb39a950a165cd5c499a9a32a8258908592aac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -14,58 +14,20 @@ namespace Eigen { namespace internal { -namespace { - -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() { #if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. gpu_assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. 
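// ---------------------------------------------------------------------------
// Context for the deleted block above: the old per-platform seeding mixed
// ::rand() into a clock reading precisely because two seeds requested within
// one clock tick would otherwise collide; the replacement defers to Eigen's
// own random<uint64_t>() instead. A standalone sketch (not the patch's code)
// of tick-collision-free seeding using only the standard library:
#include <cassert>
#include <cstdint>
#include <random>

uint64_t fresh_seed() {
  std::random_device rd;                 // nondeterministic where available
  return (uint64_t(rd()) << 32) | rd();  // two 32-bit draws -> 64-bit seed
}

int main() {
  // Seeds drawn back-to-back still differ (with overwhelming probability).
  assert(fresh_seed() != fresh_seed());
}
// ---------------------------------------------------------------------------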
- // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - -#elif defined __native_client__ - // Same approach as for win32, except using clock_gettime - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec; - return rnd; - + return blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); #else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; + // Rely on Eigen's random implementation. + return random(); #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state @@ -74,14 +36,11 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint6 return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { seed = seed ? seed : get_random_seed(); return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; } -} // namespace - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) { unsigned rnd = PCG_XSH_RS_generator(state, stream); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 0a65591e63e261ea19d797fc057b1390b95767a2..f1f4eaab730e91260da27b50309b34bc71b4a0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -166,8 +166,12 @@ struct GenericDimReducer<-1, Self, Op> { }; template + bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && + !Self::ReducerTraits::IsExactlyAssociative && + // GPU threads can quickly run out of stack space + // for moderately sized inputs. 
+ !Self::RunningOnGPU + )> struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -528,6 +532,18 @@ struct TensorReductionEvaluatorBase::size; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static constexpr bool RunningOnGPU = internal::is_same::value; + static constexpr bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) +static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; +static const bool RunningOnGPU = false; +#else + static constexpr bool RunningOnGPU = false; + static constexpr bool RunningOnSycl = false; +#endif enum { IsAligned = false, @@ -549,7 +565,7 @@ struct TensorReductionEvaluatorBase::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -578,7 +594,7 @@ struct TensorReductionEvaluatorBase(m_outputStrides[i]); } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides[static_cast(NumOutputDims - 1)] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); @@ -631,13 +647,6 @@ struct TensorReductionEvaluatorBase EIGEN_STRONG_INLINE -#if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC -#endif void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { @@ -759,19 +765,12 @@ struct TensorReductionEvaluatorBase::value; - static const bool RunningOnSycl = false; -#elif defined(EIGEN_USE_SYCL) -static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; -static const bool RunningOnGPU = false; -#else - static const bool RunningOnGPU = false; - static const bool RunningOnSycl = false; -#endif EvaluatorPointerType m_result; const Device EIGEN_DEVICE_REF m_device; @@ -987,7 +975,7 @@ template class Ma struct TensorEvaluator, Device> : public TensorReductionEvaluatorBase, Device> { typedef TensorReductionEvaluatorBase, Device> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} }; @@ -996,7 +984,7 @@ struct TensorEvaluator, : public TensorReductionEvaluatorBase, Eigen::SyclDevice> { typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel //Therefore the coeff function should be overridden by for SYCL kernel EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) 
const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index db4e8d866f04ce0586835da2595a171856ae96dd..315ccc172e9aded42fdf4b031054bb47f90ee9e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -98,6 +98,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { } } } +#ifdef EIGEN_GPU_COMPILE_PHASE // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations template __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { @@ -107,6 +108,7 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc atomicReduce(houtput+i,*(haccum+i),reducer); } } +#endif // EIGEN_GPU_COMPILE_PHASE #endif // EIGEN_HAS_GPU_FP16 template <> @@ -213,8 +215,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer #ifdef EIGEN_HAS_GPU_FP16 template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, half* scratch) { eigen_assert(blockDim.x == 1); eigen_assert(gridDim.x == 1); typedef packet_traits::type packet_type; @@ -224,15 +226,16 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo half2* h2scratch = reinterpret_cast(scratch); for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { *h2scratch = - __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); + __halves2half2(input.coeff(i), input.coeff(i + 1)); h2scratch++; } if ((num_coeffs & 1) != 0) { - half lastCoeff = input.m_impl.coeff(num_coeffs - 1); + half lastCoeff = input.coeff(num_coeffs - 1); *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); } } @@ -258,8 +261,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reduce template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, + half* output, half* scratch) { typedef typename packet_traits::type PacketType; const int packet_width = unpacket_traits::size; eigen_assert(NumPerThread % packet_width == 0); @@ -273,19 +277,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce int rem = num_coeffs % packet_width; if (rem != 0) { half2* p_scratch = reinterpret_cast(scratch); - *scratch = reducer.template initializePacket(); + pstoreu(scratch, reducer.template initializePacket()); for (int i = 0; i < rem / 2; i++) { *p_scratch = __halves2half2( - input.m_impl.coeff(num_coeffs - packet_width + 2 * i), - input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); + input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 2 * i + 1)); p_scratch++; } if ((num_coeffs & 1) != 0) { - half last = input.m_impl.coeff(num_coeffs - 1); + half last = input.coeff(num_coeffs - 1); 
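// ---------------------------------------------------------------------------
// Background for the UseTreeReduction condition added in TensorReduction.h
// above: for reducers that are not exactly associative (float sums), pairwise
// tree reduction grows rounding error like O(log n) instead of O(n), but its
// recursion is exactly why the patch disables it on GPU, where per-thread
// stack is scarce. Minimal sketch contrasting the two accumulation orders:
#include <cstdio>
#include <vector>

float running_sum(const float* x, int n) {
  float acc = 0.f;
  for (int i = 0; i < n; ++i) acc += x[i];  // error accumulates linearly
  return acc;
}

float tree_sum(const float* x, int n) {     // recursion depth ~ log2(n)
  if (n == 1) return x[0];
  const int half = n / 2;
  return tree_sum(x, half) + tree_sum(x + half, n - half);
}

int main() {
  std::vector<float> v(1 << 24, 0.1f);      // exact answer: 1677721.6
  std::printf("running=%.1f tree=%.1f\n",   // running sum drifts visibly
              running_sum(v.data(), int(v.size())),
              tree_sum(v.data(), int(v.size())));
}
// ---------------------------------------------------------------------------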
*p_scratch = __halves2half2(last, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); } } __syncthreads(); @@ -298,7 +303,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce for (Index i = 0; i < max_iter; i += BlockSize) { const Index index = first_index + packet_width * i; eigen_assert(index + packet_width < num_coeffs); - PacketType val = input.m_impl.template packet(index); + PacketType val = input.template packet(index); reducer.reducePacket(val, &accum); } @@ -337,7 +342,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(scratch, accum, reducer); + atomicReduce(reinterpret_cast(scratch), accum, reducer); } __syncthreads(); @@ -357,17 +362,21 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { eigen_assert(threadIdx.x == 1); - half2* pscratch = reinterpret_cast(scratch); - half tmp = __float2half(0.f); typedef packet_traits::type packet_type; - for (int i = 0; i < unpacket_traits::size; i += 2) { - reducer.reduce(__low2half(*pscratch), &tmp); - reducer.reduce(__high2half(*pscratch), &tmp); - pscratch++; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; } - *output = tmp; } #endif // EIGEN_HAS_GPU_FP16 @@ -416,13 +425,11 @@ template struct FullReductionLauncher { static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; - typedef typename packet_traits::type PacketType; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); - PacketType* scratch = static_cast(device.scratchpad()); - // half2* scratch = static_cast(device.scratchpad()); + half* scratch = static_cast(device.scratchpad()); if (num_blocks > 1) { // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 030d19844353e65d83f53cf344ad9bc786919315..a27d3646de15aa5c7656973c57f1c99c717dda07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -388,17 +388,17 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) : m_ref(m) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_ref.coeff(index); @@ -439,7 +439,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 3b1fca59b79c1a1f882dcf8f8ea17e60fc43dbef..586ce68ab0ae3363127f0b5118ad9dd938195916 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -121,8 +121,7 @@ struct TensorEvaluator, Device TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) @@ -150,20 +149,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -426,8 +425,7 @@ struct TensorEvaluator, Device> CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} typedef typename XprType::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index a06c4a9f31caa5ed7df61209ef22bcefd39bbdbe..beae854ddba66753048e97a4a0037ec56d12c73f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -402,8 +402,7 @@ struct TensorEvaluator, Device> { typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_exclusive(op.exclusive()), @@ -498,7 +497,7 @@ struct TensorEvaluator, Device> { return TensorOpCost(sizeof(CoeffReturnType), 0, 0); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_output) { m_device.deallocate_temp(m_output); m_output = NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index e6fed3d0baee0db0cca670a9b489fdda57b23690..e5e5efdeecad146089fea2fdf3697c030adbdb07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -118,8 +118,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) { @@ -143,7 +142,8 @@ struct TensorEvaluator, Device> m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); } } else { m_unshuffledInputStrides[NumDims - 1] = 1; @@ -152,7 +152,8 @@ struct TensorEvaluator, Device> m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); } } @@ -163,20 +164,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -384,7 +385,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 5ff0880e78ef3a48f30b85257176cbde3dea0935..66655c4777f84f3366d495f9beeda269f3346e98 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -55,17 +55,14 @@ class TensorStorage EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } - static EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const FixedDimensions& dimensions() - { - static const FixedDimensions* singleton_dimensions = new FixedDimensions(); - return *singleton_dimensions; - } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return FixedDimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } }; + // pure dynamic 
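// ---------------------------------------------------------------------------
// On the 'm_outputStrides[i] > 0 ? ... : 1' guards in the shuffling hunks
// above (and the slicing ones earlier): a size-0 dimension zeroes some output
// strides, and a TensorIntDivisor-style fast divisor cannot be built from 0.
// Simplified 32-bit model of the multiply-shift division trick (an assumed
// sketch, not Eigen's exact algorithm; needs a compiler with __uint128_t):
#include <cassert>
#include <cstdint>

struct FastDiv {
  uint64_t magic;
  explicit FastDiv(uint32_t d) : magic(~uint64_t(0) / d + 1) {
    assert(d > 1);  // d == 0 is a division by zero; d == 1 overflows 'magic'
  }
  uint32_t divide(uint32_t n) const {
    return uint32_t((__uint128_t(magic) * n) >> 64);  // take the high 64 bits
  }
};

int main() {
  const FastDiv div7(7);
  for (uint32_t n = 0; n < 100000; ++n) assert(div7.divide(n) == n / 7);
}
// ---------------------------------------------------------------------------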
template class TensorStorage, Options_> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 64bf3f139492347512c0ff08b6b9128b6bb4ce63..2f62a668f4dac17452dc3f0cf6323bfad130e8ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -109,7 +109,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); @@ -142,11 +142,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -277,7 +277,7 @@ struct TensorEvaluator, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } typedef typename XprType::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 24d22c189cacbce7b96532231e5f4b935f36bb5b..bbd2ff33259635bcf3f46bda987dc5ec3cae302d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -108,7 +108,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { @@ -134,6 +134,7 @@ struct TensorEvaluator, Device> } } + EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims); eigen_assert(num_distinct_reduce_dims == NumReducedDims); // Compute the dimensions of the result. @@ -211,12 +212,12 @@ struct TensorEvaluator, Device> return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 4f7fd340ee2492e6d0eb8525439b23a6d39583cd..a8a535a7fc7ac0b75b8238b4a7e87e4e776417a3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -254,10 +254,10 @@ struct nested > // the SAME case. // When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, // Pc=0. 
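// ---------------------------------------------------------------------------
// The TensorTrace hunk above marks num_distinct_reduce_dims with
// EIGEN_ONLY_USED_FOR_DEBUG because the variable only feeds eigen_assert():
// with NDEBUG the assert compiles away and the compiler would warn about an
// unused variable. A stand-in for the macro and the pattern (the macro name
// below is local to this sketch, not Eigen's definition):
#include <cassert>

#define ONLY_USED_FOR_DEBUG(x) (void)(x)

int main() {
  int checked_value = 2 + 2;
  ONLY_USED_FOR_DEBUG(checked_value);  // silences -Wunused under NDEBUG
  assert(checked_value == 4);
}
// ---------------------------------------------------------------------------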
-typedef enum { +enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 -} PaddingType; +}; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index d23f2e4c81eabb3e3a68e0fb35c49ac5f03c2a68..afbcba4a27f25b1640196fc72e29b2cda004c525 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -78,14 +78,14 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator == (const TensorUInt128& lhs, const TensorUInt128& rhs) { - return (lhs.high == rhs.high) & (lhs.low == rhs.low); + return (lhs.high == rhs.high) && (lhs.low == rhs.low); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator != (const TensorUInt128& lhs, const TensorUInt128& rhs) { - return (lhs.high != rhs.high) | (lhs.low != rhs.low); + return (lhs.high != rhs.high) || (lhs.low != rhs.low); } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 81bed57f389766f049bb5ae582ca097e361022a1..0beb9ff09b832f16006d8a57f3079f3aedb91599 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -194,7 +194,7 @@ struct TensorEvaluator, D typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -352,12 +352,12 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -518,21 +518,21 @@ struct TensorEvaluator, D const TensorEvaluator& impl() const { return m_impl; } - Index planePaddingTop() const { return m_planePaddingTop; } - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputPlanes() const { return m_outputPlanes; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userPlaneStride() const { return m_plane_strides; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInPlaneStride() const { return m_in_plane_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index planeInflateStride() const { return m_plane_inflate_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + 
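// ---------------------------------------------------------------------------
// On the operator swaps in TensorUInt128 above and the TensorIndexList hunks
// earlier in the patch: for bool operands '&'/'|' and '&&'/'||' agree in
// value, but the logical forms short-circuit, which both silences newer
// compiler warnings and lets the left operand guard the right one.
// Standalone illustration of the guarding behavior:
#include <cassert>

constexpr int table[3] = {10, 20, 30};

constexpr bool known_and_equal(int i, int value) {
  // With '&', table[i] would be evaluated even when the bound check fails,
  // which is out-of-range (and ill-formed in a constant expression).
  return (i >= 0 && i < 3) && (table[i] == value);
}

int main() {
  static_assert(known_and_equal(1, 20), "guarded lookup");
  assert(!known_and_equal(7, 20));  // short-circuit: table[7] is never read
}
// ---------------------------------------------------------------------------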
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } #ifdef EIGEN_USE_SYCL // binding placeholder accessors to a command group handler for SYCL diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index 149ceaff01d99924b3fb6b410b524c66097e37ce..f662dee5bebfc96d34e41e184a512aa5fda1cb38 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -81,7 +81,8 @@ template struct take<0, type_list> template<> struct take<0, type_list<>> { typedef type_list<> type; }; template struct take> : concat, typename take>::type> {}; -template struct take> { typedef numeric_list type; }; +// XXX The following breaks in gcc-11, and is invalid anyways. 
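// ---------------------------------------------------------------------------
// The annotation churn throughout this patch follows one policy: accessors
// that kernels call (the getters above, coeff()/packet()) gain
// EIGEN_DEVICE_FUNC, while host-side orchestration (the many constructors,
// evalSubExprsIfNeeded() and cleanup() methods) loses it, since marking
// host-only code __device__ only draws nvcc warnings. Rough model of what
// the macro amounts to under a CUDA compiler (simplified; the real
// definition lives in Eigen's util/Macros.h):
#include <cstdio>

#if defined(__CUDACC__)
#define DEVICE_FUNC __host__ __device__   // callable from host and kernels
#else
#define DEVICE_FUNC                       // plain host function otherwise
#endif

struct Evaluator {
  DEVICE_FUNC int coeff(int i) const { return i; }  // needed inside kernels
  bool evalSubExprsIfNeeded() { return true; }      // host-only: allocates,
};                                                  // launches, dispatches

int main() {
  Evaluator e;
  std::printf("%d %d\n", e.coeff(3), int(e.evalSubExprsIfNeeded()));
}
// ---------------------------------------------------------------------------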
+// template struct take> { typedef numeric_list type; }; template struct take<0, numeric_list> { typedef numeric_list type; }; template struct take<0, numeric_list> { typedef numeric_list type; }; diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT index c8c311a60bef74004ac3daad1d1c661e55c70ec3..72cd52143a6e86dba18f7f8a7a4778a7652f5856 100644 --- a/unsupported/Eigen/FFT +++ b/unsupported/Eigen/FFT @@ -162,15 +162,16 @@ class FFT typedef typename impl_type::Scalar Scalar; typedef typename impl_type::Complex Complex; - enum Flag { - Default=0, // goof proof - Unscaled=1, - HalfSpectrum=2, - // SomeOtherSpeedOptimization=4 - Speedy=32767 - }; - - FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { } + typedef int Flag; + static const Flag Default = 0; + static const Flag Unscaled = 1; + static const Flag HalfSpectrum = 2; + static const Flag Speedy = 32767; + + FFT(const impl_type& impl = impl_type(), Flag flags = Default) : m_impl(impl), m_flag(flags) + { + eigen_assert((flags == Default || flags == Unscaled || flags == HalfSpectrum || flags == Speedy) && "invalid flags argument"); + } inline bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;} diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 0ef159e30041882458e569eab56c42170c18873d..0f166e35f01f2ab4951404693366b191d0ebe71b 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -26,11 +26,11 @@ void make_coherent(const A& a, const B&b) make_coherent_impl::run(a.const_cast_derived(), b.const_cast_derived()); } -template struct auto_diff_special_op; +template struct auto_diff_special_op; } // end namespace internal -template class AutoDiffScalar; +template class AutoDiffScalar; template inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { @@ -38,16 +38,16 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: } /** \class AutoDiffScalar - * \brief A scalar type replacement with automatic differentation capability + * \brief A scalar type replacement with automatic differentiation capability * - * \param _DerType the vector type used to store/represent the derivatives. The base scalar type + * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type * as well as the number of derivatives to compute are determined from this type. * Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf * if the number of derivatives is not known at compile time, and/or, the number * of derivatives is large. - * Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a + * Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a * existing vector into an AutoDiffScalar. - * Finally, _DerType can also be any Eigen compatible expression. + * Finally, DerivativeType can also be any Eigen compatible expression. * * This class represents a scalar value while tracking its respective derivatives using Eigen's expression * template mechanism. 
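// ---------------------------------------------------------------------------
// On the renames in the AutoDiffScalar hunks here and below: identifiers like
// '_DerType' (underscore followed by an uppercase letter) are reserved for
// the implementation by the C++ standard ([lex.name]), so libraries must not
// declare them; 'DerivativeType' is the conforming spelling, with the old
// short name kept as a public alias. Minimal sketch of the safe pattern:
template <typename DerivativeType>      // no reserved identifier
struct AutoDiffLike {
  typedef DerivativeType DerType;       // public alias, as the patch keeps
  DerivativeType der;
};

int main() {
  AutoDiffLike<double> x{1.5};
  return x.der == 1.5 ? 0 : 1;
}
// ---------------------------------------------------------------------------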
@@ -63,17 +63,17 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: * */ -template +template class AutoDiffScalar : public internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> Base; - typedef typename internal::remove_all<_DerType>::type DerType; + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> Base; + typedef typename internal::remove_all::type DerType; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real Real; @@ -382,16 +382,16 @@ class AutoDiffScalar namespace internal { -template -struct auto_diff_special_op<_DerType, true> -// : auto_diff_scalar_op<_DerType, typename NumTraits::Real, +template +struct auto_diff_special_op +// : auto_diff_scalar_op::Real, // is_same::Real>::value> { - typedef typename remove_all<_DerType>::type DerType; + typedef typename remove_all::type DerType; typedef typename traits::Scalar Scalar; typedef typename NumTraits::Real Real; -// typedef auto_diff_scalar_op<_DerType, typename NumTraits::Real, +// typedef auto_diff_scalar_op::Real, // is_same::Real>::value> Base; // using Base::operator+; @@ -401,8 +401,8 @@ struct auto_diff_special_op<_DerType, true> // using Base::operator*; // using Base::operator*=; - const AutoDiffScalar<_DerType>& derived() const { return *static_cast*>(this); } - AutoDiffScalar<_DerType>& derived() { return *static_cast*>(this); } + const AutoDiffScalar& derived() const { return *static_cast*>(this); } + AutoDiffScalar& derived() { return *static_cast*>(this); } inline const AutoDiffScalar operator+(const Real& other) const @@ -410,12 +410,12 @@ struct auto_diff_special_op<_DerType, true> return AutoDiffScalar(derived().value() + other, derived().derivatives()); } - friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar<_DerType>& b) + friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar& b) { return AutoDiffScalar(a + b.value(), b.derivatives()); } - inline AutoDiffScalar<_DerType>& operator+=(const Real& other) + inline AutoDiffScalar& operator+=(const Real& other) { derived().value() += other; return derived(); @@ -431,22 +431,22 @@ struct auto_diff_special_op<_DerType, true> } friend inline const AutoDiffScalar >, DerType>::Type > - operator*(const Real& other, const AutoDiffScalar<_DerType>& a) + operator*(const Real& other, const AutoDiffScalar& a) { return AutoDiffScalar >, DerType>::Type >( a.value() * other, a.derivatives() * other); } - inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other) + inline AutoDiffScalar& operator*=(const Scalar& other) { *this = *this * other; return derived(); } }; -template -struct auto_diff_special_op<_DerType, false> +template +struct auto_diff_special_op { void operator*() const; void operator-() const; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index d7672d7c9739ee2f0688d3ca93b80475c4263175..ce92f5bfd7c749f5d52b61c1973dcee70e63b2c9 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -281,7 +281,7 @@ inline int MatrixPowerAtomic::getPadeDegree(long double normIminusT) #endif int degree = 3; for (; degree <= 
maxPadeDegree; ++degree) - if (normIminusT <= maxNormForPade[degree - 3]) + if (normIminusT <= static_cast(maxNormForPade[degree - 3])) break; return degree; } diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h index 6ab8f9714b66f2c6b83c75551513748ec7d6c0b6..59a15b098e2e6e655babaa6b5beccb68e4daefda 100644 --- a/unsupported/Eigen/src/Polynomials/Companion.h +++ b/unsupported/Eigen/src/Polynomials/Companion.h @@ -20,12 +20,6 @@ namespace internal { #ifndef EIGEN_PARSED_BY_DOXYGEN -template -T radix(){ return 2; } - -template -T radix2(){ return radix()*radix(); } - template struct decrement_if_fixed_size { @@ -141,7 +135,10 @@ inline bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, bool& isBalanced, RealScalar& colB, RealScalar& rowB ) { - if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; } + if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm + || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){ + return true; + } else { //To find the balancing coefficients, if the radix is 2, @@ -149,33 +146,41 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$ // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$ // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$ - rowB = rowNorm / radix(); + const RealScalar radix = RealScalar(2); + const RealScalar radix2 = RealScalar(4); + + rowB = rowNorm / radix; colB = RealScalar(1); const RealScalar s = colNorm + rowNorm; - while (colNorm < rowB) + // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm + RealScalar scout = colNorm; + while (scout < rowB) { - colB *= radix(); - colNorm *= radix2(); + colB *= radix; + scout *= radix2; } - - rowB = rowNorm * radix(); - - while (colNorm >= rowB) + + // We now have an upper-bound for sigma, try to lower it. + // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm + scout = colNorm * (colB / radix) * colB; // Avoid overflow. + while (scout >= rowNorm) { - colB /= radix(); - colNorm /= radix2(); + colB /= radix; + scout /= radix2; } - //This line is used to avoid insubstantial balancing - if ((rowNorm + colNorm) < RealScalar(0.95) * s * colB) + // This line is used to avoid insubstantial balancing. + if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB) { isBalanced = false; rowB = RealScalar(1) / colB; return false; } - else{ - return true; } + else + { + return true; + } } } diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index f1c260e29450eb6b557e664138b8093d343a6f9e..243ffdd5e33f0af5feaecf7539dfa1f0129a3af4 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -301,12 +301,9 @@ struct digamma_impl { This implementation works on both scalars and Ts. */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) { - // Clamp the inputs to the range [-4, 4] since anything outside - // this range is +/-1.0f in single-precision. 
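// ---------------------------------------------------------------------------
// The Companion::balanced() rewrite above searches for a power-of-two column
// scale colB (row scale 1/colB) that evens out the row and column norms,
// tracking a 'scout' value so the norms themselves are never squared into
// overflow. Simplified scalar sketch of the search and the 0.95 acceptance
// test (structure per the hunk, details condensed):
#include <cassert>

bool balance(double colNorm, double rowNorm, double& colB) {
  const double s = colNorm + rowNorm;
  colB = 1.0;
  double scout = colNorm;  // invariant: scout == colNorm * colB * colB
  while (scout < rowNorm / 2) { colB *= 2; scout *= 4; }  // grow the scale
  while (scout / 2 >= rowNorm) { colB /= 2; scout /= 4; } // undo overshoot
  // Accept only if scaling shrinks the combined norm by at least 5%.
  return (rowNorm / colB + colNorm * colB) < 0.95 * s;
}

int main() {
  double colB = 1.0;
  assert(balance(1.0, 1024.0, colB) && colB > 1.0);  // wildly unbalanced
  assert(!balance(1.0, 1.0, colB));                  // already balanced
}
// ---------------------------------------------------------------------------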
-  const T plus_4 = pset1<T>(4.f);
-  const T minus_4 = pset1<T>(-4.f);
-  const T x = pmax(pmin(a_x, plus_4), minus_4);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
+  const float kErfInvOneMinusHalfULP = 3.832506856900711f;
+  const T clamp = pcmp_le(pset1<T>(kErfInvOneMinusHalfULP), pabs(x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
   const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
@@ -342,7 +339,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
   q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
-  return pdiv(p, q);
+  const T sign = pselect(pcmp_le(x, pset1<T>(0.0f)), pset1<T>(-1.0f), pset1<T>(1.0f));
+  return pselect(clamp, sign, pdiv(p, q));
 }
 
 template <typename T>
@@ -473,9 +471,9 @@ struct erfc_impl {
  *  ERROR MESSAGES:
  *
  *   message         condition    value returned
- * ndtri domain       x <= 0        -MAXNUM
- * ndtri domain       x >= 1         MAXNUM
- *
+ * ndtri domain       x == 0        -INF
+ * ndtri domain       x == 1         INF
+ * ndtri domain       x < 0, x > 1   NAN
  */
 
 /*
 Cephes Math Library Release 2.2:  June, 1992
@@ -637,8 +635,8 @@ T generic_ndtri(const T& a) {
           generic_ndtri_lt_exp_neg_two(b, should_flipsign));
 
   return pselect(
-      pcmp_le(a, zero), neg_maxnum,
-      pselect(pcmp_le(one, a), maxnum, ndtri));
+      pcmp_eq(a, zero), neg_maxnum,
+      pselect(pcmp_eq(one, a), maxnum, ndtri));
 }
 
 template <typename Scalar>
@@ -1387,7 +1385,7 @@ struct zeta_impl {
         };
 
     const Scalar maxnum = NumTraits<Scalar>::infinity();
-    const Scalar zero = 0.0, half = 0.5, one = 1.0;
+    const Scalar zero = Scalar(0.0), half = Scalar(0.5), one = Scalar(1.0);
     const Scalar machep = cephes_helper<Scalar>::machep();
     const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
@@ -1429,11 +1427,19 @@ struct zeta_impl {
             return s;
         }
 
+        // If b is zero, then the tail sum will also end up being zero.
+        // Exiting early here can prevent NaNs for some large inputs, where
+        // the tail sum computed below has term `a` which can overflow to `inf`.
+        if (numext::equal_strict(b, zero)) {
+          return s;
+        }
+
         w = a;
         s += b*w/(x-one);
         s -= half * b;
         a = one;
         k = zero;
+
         for( i=0; i<12; i++ )
         {
             a *= x + k;
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
index 7dd3c3e5be6de84176da2459a860fec6c344c770..909b08e16503483b4d67ad0b43c214e12cefcc99 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -4,6 +4,9 @@
 namespace Eigen {
 namespace internal {
 
+// Bessel functions only available for some compilers.
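Back in SpecialFunctionsImpl.h, the erf change replaces input clamping with output selection: once |x| >= erf^-1(1 - 0.5 ulp(1.0f)) ~= 3.8325, the answer is exactly +/-1 in float, so the polynomial result is only used inside that range. A scalar sketch of the same selection (std::erf stands in for the patch's rational approximant p/q; only the threshold constant is taken from the patch):

#include <cmath>
#include <cstdio>

// Smallest float magnitude at which erf rounds to exactly 1.0f.
const float kErfInvOneMinusHalfULP = 3.832506856900711f;

float fast_erf_scalar(float x) {
  if (std::abs(x) >= kErfInvOneMinusHalfULP)
    return x < 0.0f ? -1.0f : 1.0f;  // saturate, keeping the sign of x
  return std::erf(x);                // stand-in for the p/q polynomial ratio
}

int main() {
  std::printf("%f %f\n", fast_erf_scalar(4.0f), fast_erf_scalar(-4.0f));  // 1 -1
  return 0;
}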
+#if EIGEN_HAS_AVX512_MATH + F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0) @@ -40,6 +43,8 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0) F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1) +#endif + } // namespace internal } // namespace Eigen diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt index bef4f1925c6b0fb0666b8895883af8b315cbdbe1..1d0f721dc1228061edad1ba71680713fb38c7764 100644 --- a/unsupported/doc/examples/SYCL/CMakeLists.txt +++ b/unsupported/doc/examples/SYCL/CMakeLists.txt @@ -3,8 +3,7 @@ FILE(GLOB examples_SRCS "*.cpp") set(EIGEN_SYCL ON) list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread) if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 14) - set(STD_CXX_FLAG "-std=c++1z") + set(CMAKE_CXX_STANDARD 17) else(EIGEN_SYCL_TRISYCL) if(MSVC) # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 181919361e5d3a34c06828f8b62084255ce3bccc..e917dffb5cd7a3df3dea16f2e2c22de957aa74c0 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -55,13 +55,11 @@ ei_add_test(FFT) ei_add_test(EulerAngles) -find_package(MPFR 2.3.0) -find_package(GMP) -if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11) - include_directories(${MPFR_INCLUDES} ./mpreal) +find_package(MPREAL) +if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") - set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" ) + include_directories(${MPREAL_INCLUDES}) + ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() @@ -165,8 +163,8 @@ if(EIGEN_TEST_CXX11) endif() if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 14) - set(STD_CXX_FLAG "-std=c++1z") + # triSYCL now requires c++17. + set(CMAKE_CXX_STANDARD 17) else() if(MSVC) # Set the host and device compilers C++ standard to C++14. 
On Windows setting this to C++11 @@ -292,8 +290,14 @@ endif() endif() # These tests needs nvcc -find_package(CUDA 7.0) -if(CUDA_FOUND AND EIGEN_TEST_CUDA) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) +else() + message(STATUS "Could NOT find CUDA.") +endif() + +if(CMAKE_CUDA_COMPILER AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings # in the CUDA runtime @@ -304,30 +308,23 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}") endforeach() + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") + else() + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") + endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") - endif() - - set(NVCC_ARCH_FLAGS) - foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") - endforeach() - set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}") - cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(cxx11_tensor_complex_gpu) diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index c667b7247ddcd72d8bbac7150f4c024f2748cfcb..b6c29ca4daa4be2556acec3faf77414031a59e21 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -12,14 +12,10 @@ // It is intended to be done for this test only. 
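The NonLinearOptimization.cpp hunk below also retires `#define LM_EVAL_COUNT_TOL 4/3`. Worth spelling out why: the unparenthesized token sequence `4/3` is integer division, so the old macro only behaved as a 4/3 factor by accident of operand order, as this small demo shows (the values are illustrative):

#include <cstdio>

#define TOL_OLD 4/3   // expands textually; a bare 4/3 is integer division == 1
#define TOL_NEW 2     // the patch's explicit 2x slack

int main() {
  const int nfev = 26;                  // a reference evaluation count from the tests
  std::printf("%d\n", nfev * TOL_OLD);  // 34: only works because it parses as (26*4)/3
  std::printf("%d\n", TOL_OLD * nfev);  // 26: same macro, reordered, is no tolerance at all
  std::printf("%d\n", nfev * TOL_NEW);  // 52: the new, unambiguous bound
  return 0;
}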
#include -// tolerance for chekcing number of iterations -#define LM_EVAL_COUNT_TOL 4/3 +// tolerance for checking number of iterations +#define LM_EVAL_COUNT_TOL 2 #define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \ - ++g_test_level; \ - VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \ - VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \ - --g_test_level; \ VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \ VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \ } @@ -186,9 +182,10 @@ void testLmder1() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.lmder1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -214,9 +211,10 @@ void testLmder() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -298,9 +296,10 @@ void testHybrj1() hybrj_functor functor; HybridNonLinearSolver solver(functor); info = solver.hybrj1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(solver, 11, 1); // check norm @@ -332,9 +331,10 @@ void testHybrj() solver.diag.setConstant(n, 1.); solver.useExternalScaling = true; info = solver.solve(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(solver, 11, 1); // check norm @@ -385,10 +385,11 @@ void testHybrd1() hybrd_functor functor; HybridNonLinearSolver solver(functor); info = solver.hybrd1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(solver.nfev, 20); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(solver.nfev <= 20*LM_EVAL_COUNT_TOL); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -416,10 +417,11 @@ void testHybrd() solver.diag.setConstant(n, 1.); solver.useExternalScaling = true; info = solver.solveNumericalDiff(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(solver.nfev, 14); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(solver.nfev <= 14*LM_EVAL_COUNT_TOL); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -487,9 +489,10 @@ void testLmstr1() lmstr_functor functor; LevenbergMarquardt lm(functor); info = lm.lmstr1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -515,9 +518,10 @@ void testLmstr() lmstr_functor functor; LevenbergMarquardt lm(functor); info = lm.minimizeOptimumStorage(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -570,10 +574,11 @@ void testLmdif1() lmdif_functor functor; DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning info = LevenbergMarquardt::lmdif1(functor, x, &nfev); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(nfev, 26); + // VERIFY_IS_EQUAL(info, 1); + VERIFY( nfev <= 26*LM_EVAL_COUNT_TOL); // check norm functor(x, fvec); @@ -601,10 +606,11 @@ void testLmdif() NumericalDiff numDiff(functor); LevenbergMarquardt > lm(numDiff); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev, 26); + // 
VERIFY_IS_EQUAL(info, 1); + VERIFY(lm.nfev <= 26*LM_EVAL_COUNT_TOL); // check norm fnorm = lm.fvec.blueNorm(); @@ -686,9 +692,10 @@ void testNistChwirut2(void) chwirut2_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 10, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); @@ -706,9 +713,10 @@ void testNistChwirut2(void) lm.parameters.ftol = 1.E6*NumTraits::epsilon(); lm.parameters.xtol = 1.E6*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 7, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); @@ -764,9 +772,10 @@ void testNistMisra1a(void) misra1a_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 19, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); @@ -780,9 +789,10 @@ void testNistMisra1a(void) x<< 250., 0.0005; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 5, 4); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); @@ -852,9 +862,10 @@ void testNistHahn1(void) hahn1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 11, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); @@ -873,9 +884,10 @@ void testNistHahn1(void) x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 11, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); @@ -936,9 +948,10 @@ void testNistMisra1d(void) misra1d_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 3); + // VERIFY_IS_EQUAL(info, 3); LM_CHECK_N_ITERS(lm, 9, 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); @@ -952,9 +965,10 @@ void testNistMisra1d(void) x<< 450., 0.0003; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 4, 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); @@ -1012,13 +1026,14 @@ void testNistLanczos1(void) lanczos1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 79, 72); // check norm^2 - std::cout.precision(30); - std::cout << lm.fvec.squaredNorm() << "\n"; + // std::cout.precision(30); + // std::cout << lm.fvec.squaredNorm() << "\n"; VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); @@ -1034,9 +1049,10 @@ void testNistLanczos1(void) x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3; // do the computation info = lm.minimize(x); + 
EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 9, 8); // check norm^2 VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); @@ -1098,9 +1114,10 @@ void testNistRat42(void) rat42_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 10, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); @@ -1115,9 +1132,10 @@ void testNistRat42(void) x<< 75., 2.5, 0.07; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); @@ -1174,9 +1192,10 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 284, 249); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); @@ -1191,9 +1210,10 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 3); + // VERIFY_IS_EQUAL(info, 3); LM_CHECK_N_ITERS(lm, 126, 116); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); @@ -1251,9 +1271,10 @@ void testNistBoxBOD(void) lm.parameters.xtol = 1.E6*NumTraits::epsilon(); lm.parameters.factor = 10.; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 31, 25); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); @@ -1270,10 +1291,11 @@ void testNistBoxBOD(void) lm.parameters.ftol = NumTraits::epsilon(); lm.parameters.xtol = NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - LM_CHECK_N_ITERS(lm, 15, 14); + // VERIFY_IS_EQUAL(info, 1); + LM_CHECK_N_ITERS(lm, 20, 14); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); // check x @@ -1331,6 +1353,7 @@ void testNistMGH17(void) lm.parameters.xtol = NumTraits::epsilon(); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); @@ -1342,7 +1365,7 @@ void testNistMGH17(void) VERIFY_IS_APPROX(x[4], 2.2122699662E-02); // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 602, 545); /* @@ -1352,9 +1375,10 @@ void testNistMGH17(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); @@ -1417,9 +1441,10 @@ void testNistMGH09(void) LevenbergMarquardt lm(functor); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 490, 376); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); @@ -1436,9 +1461,10 @@ void testNistMGH09(void) // do the computation 
lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); @@ -1501,9 +1527,10 @@ void testNistBennett5(void) LevenbergMarquardt lm(functor); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 758, 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); @@ -1518,9 +1545,10 @@ void testNistBennett5(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 203, 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); @@ -1587,9 +1615,10 @@ void testNistThurber(void) lm.parameters.ftol = 1.E4*NumTraits::epsilon(); lm.parameters.xtol = 1.E4*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 39,36); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); @@ -1611,9 +1640,10 @@ void testNistThurber(void) lm.parameters.ftol = 1.E4*NumTraits::epsilon(); lm.parameters.xtol = 1.E4*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 29, 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); @@ -1677,9 +1707,10 @@ void testNistRat43(void) lm.parameters.ftol = 1.E6*NumTraits::epsilon(); lm.parameters.xtol = 1.E6*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 27, 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); @@ -1698,9 +1729,10 @@ void testNistRat43(void) lm.parameters.ftol = 1.E5*NumTraits::epsilon(); lm.parameters.xtol = 1.E5*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 9, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); @@ -1760,9 +1792,10 @@ void testNistEckerle4(void) eckerle4_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); @@ -1777,9 +1810,10 @@ void testNistEckerle4(void) x<< 1.5, 5., 450.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 7, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index b2e26ebb7334e417bbd957b74a9ff9cd500f0ecd..d66a63e0fdf8aa8cb181cb08455e0622c96d65d4 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -244,7 +244,7 @@ static void test_eval_tensor_binary_with_unary_expr_block() { 
rhs.setRandom(); VerifyBlockEvaluator( - (lhs.square() + rhs.square()).sqrt(), + (lhs.abs() + rhs.abs()).sqrt(), [&dims]() { return RandomBlock(dims, 1, 10); }); } diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index d3dab891f20d4f0f6eae7bc7b245d408df3f7a2c..cbd92c328e1cc2e6c7d6348b08e35f9d80a0ba21 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -256,6 +256,22 @@ static void test_simple_broadcasting_n_by_one() } } +template +static void test_size_one_broadcasting() +{ + Tensor tensor(1); + tensor.setRandom(); + array broadcasts = {64}; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), broadcasts[0]); + + for (int i = 0; i < broadcasts[0]; ++i) { + VERIFY_IS_EQUAL(tensor(0), broadcast(i)); + } +} + template static void test_simple_broadcasting_one_by_n_by_one_1d() { @@ -328,4 +344,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting) CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); + CALL_SUBTEST(test_size_one_broadcasting()); + CALL_SUBTEST(test_size_one_broadcasting()); } diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index 575bdc1f9594ab30d80887c9e0badc5c4f202b26..5abf2131dbbc71bf7fd376cf1b2910b873b69ddb 100644 --- a/unsupported/test/cxx11_tensor_contract_gpu.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -25,10 +25,6 @@ typedef Tensor::DimensionPair DimPair; template void test_gpu_contraction(int m_size, int k_size, int n_size) { - std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU Tensor t_left(m_size, k_size); Tensor t_right(k_size, n_size); Tensor t_result(m_size, n_size); @@ -171,25 +167,45 @@ void test_gpu_contraction_n() { template void test_gpu_contraction_sizes() { - int m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; - - int n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_gpu_contraction(m_sizes[i], n_sizes[j], k_sizes[k]); + int m_sizes[3][5] = {{ 31, 39, 63, 64, 65}, + {127, 129, 255, 257 , 511}, + {512, 513, 1023, 1024, 1025}}; + + int n_sizes[3][5] = {{ 31, 39, 63, 64, 65}, + {127, 129, 255, 257, 511}, + {512, 513, 1023, 1024, 1025}}; + + int k_sizes[3][6] = {{ 31, 39, 63, 64, 65, 95}, + { 96, 127, 129, 255, 257, 511}, + {512, 513, 725, 1023, 1024, 1025}}; + + // Some selection of specific cases. 
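One detail of the offset cycling in this contraction-size hunk: the rewind steps later in the loop add extra multiples of the column count before taking `%`, because C++'s remainder of a negative left operand is negative (or zero) and would produce an invalid column index. A two-line demonstration with the hunk's own constants:

#include <cstdio>

int main() {
  const int k_cols = 6;
  int k_offset = 2;
  std::printf("%d\n", (k_offset - 9) % k_cols);               // -1: unusable as an index
  std::printf("%d\n", (k_offset + 2 * k_cols - 9) % k_cols);  //  5: wraps as intended
  return 0;
}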
+ // - m changes rows each iteration + // - n changes rows each 3 iterations + // - k changes rows each 9 iterations + // - within a row, advance once column each iteration + const int m_cols = 5; + const int n_cols = 5; + const int k_cols = 6; + int m_offset = 0; + int n_offset = 1; + int k_offset = 2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + for (int l = 0; l < 3; ++l) { + int m = m_sizes[l][m_offset]; + int n = n_sizes[j][n_offset]; + int k = k_sizes[i][k_offset]; + test_gpu_contraction(m, n, k); + n_offset = (n_offset + 1) % n_cols; + k_offset = (k_offset + 1) % k_cols; + } + m_offset = (m_offset + 1) % m_cols; + if (j < 2) { + n_offset = (n_offset + n_cols - 3) % n_cols; // Rewind 3. } } + k_offset = (k_offset + 2 * k_cols - 9) % k_cols; // Rewind 9. } } diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 169fc1898a6803a5ff7b617981e65068fd524ae8..27c284514e7bc78ad47abf480cd603ba20c866a9 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -305,10 +305,10 @@ void test_minmax_nan_propagation_templ() { const Scalar kNaN = std::numeric_limits::quiet_NaN(); const Scalar kInf = std::numeric_limits::infinity(); const Scalar kZero(0); - Tensor vec_all_nan(size); + Tensor vec_full_nan(size); Tensor vec_one_nan(size); Tensor vec_zero(size); - vec_all_nan.setConstant(kNaN); + vec_full_nan.setConstant(kNaN); vec_zero.setZero(); vec_one_nan.setZero(); vec_one_nan(size/2) = kNaN; @@ -330,12 +330,12 @@ void test_minmax_nan_propagation_templ() { // max(nan, 0) = nan // max(0, nan) = nan // max(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMax(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_all_nan)); - verify_all_nan(vec_all_nan.template cwiseMax(kZero)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMax(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_full_nan)); + verify_all_nan(vec_full_nan.template cwiseMax(kZero)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_zero)); verify_all_nan(vec_zero.template cwiseMax(kNaN)); - verify_all_nan(vec_zero.template cwiseMax(vec_all_nan)); + verify_all_nan(vec_zero.template cwiseMax(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMax(kZero)); verify_all_zero(vec_zero.template cwiseMax(vec_zero)); @@ -344,12 +344,12 @@ void test_minmax_nan_propagation_templ() { // max(nan, 0) = 0 // max(0, nan) = 0 // max(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMax(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_all_nan)); - verify_all_zero(vec_all_nan.template cwiseMax(kZero)); - verify_all_zero(vec_all_nan.template cwiseMax(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMax(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_full_nan)); + verify_all_zero(vec_full_nan.template cwiseMax(kZero)); + verify_all_zero(vec_full_nan.template cwiseMax(vec_zero)); verify_all_zero(vec_zero.template cwiseMax(kNaN)); - verify_all_zero(vec_zero.template cwiseMax(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMax(kZero)); verify_all_zero(vec_zero.template cwiseMax(vec_zero)); @@ -358,12 +358,12 @@ void test_minmax_nan_propagation_templ() { // min(nan, 0) = nan // min(0, nan) = nan // min(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMin(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_all_nan)); - verify_all_nan(vec_all_nan.template cwiseMin(kZero)); - 
verify_all_nan(vec_all_nan.template cwiseMin(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMin(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_full_nan)); + verify_all_nan(vec_full_nan.template cwiseMin(kZero)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_zero)); verify_all_nan(vec_zero.template cwiseMin(kNaN)); - verify_all_nan(vec_zero.template cwiseMin(vec_all_nan)); + verify_all_nan(vec_zero.template cwiseMin(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMin(kZero)); verify_all_zero(vec_zero.template cwiseMin(vec_zero)); @@ -372,12 +372,12 @@ void test_minmax_nan_propagation_templ() { // min(nan, 0) = 0 // min(0, nan) = 0 // min(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMin(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_all_nan)); - verify_all_zero(vec_all_nan.template cwiseMin(kZero)); - verify_all_zero(vec_all_nan.template cwiseMin(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMin(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_full_nan)); + verify_all_zero(vec_full_nan.template cwiseMin(kZero)); + verify_all_zero(vec_full_nan.template cwiseMin(vec_zero)); verify_all_zero(vec_zero.template cwiseMin(kNaN)); - verify_all_zero(vec_zero.template cwiseMin(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMin(kZero)); verify_all_zero(vec_zero.template cwiseMin(vec_zero)); @@ -397,13 +397,13 @@ void test_minmax_nan_propagation_templ() { VERIFY_IS_EQUAL(val(), kZero); // Test NaN propagation for tensor of all NaNs. - val = vec_all_nan.template minimum(); + val = vec_full_nan.template minimum(); VERIFY((numext::isnan)(val())); - val = vec_all_nan.template minimum(); + val = vec_full_nan.template minimum(); VERIFY_IS_EQUAL(val(), kInf); - val = vec_all_nan.template maximum(); + val = vec_full_nan.template maximum(); VERIFY((numext::isnan)(val())); - val = vec_all_nan.template maximum(); + val = vec_full_nan.template maximum(); VERIFY_IS_EQUAL(val(), -kInf); // Test NaN propagation for tensor with a single NaN. 
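These renamed tensors exercise the PropagateNaN / PropagateNumbers reduction variants. In scalar terms the two policies differ only in how a lone NaN operand is treated (an illustrative sketch assuming IEEE semantics; the helper functions are not Eigen API):

#include <cmath>
#include <cstdio>
#include <limits>

// PropagateNaN: any NaN operand poisons the result.
float max_propagate_nan(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<float>::quiet_NaN();
  return std::fmax(a, b);
}

// PropagateNumbers: prefer the numeric operand; std::fmax already does this.
float max_propagate_numbers(float a, float b) { return std::fmax(a, b); }

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("%f\n", max_propagate_nan(nan, 0.f));      // nan
  std::printf("%f\n", max_propagate_numbers(nan, 0.f));  // 0.000000
  return 0;
}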
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index 137d0d5969fbd6e3b545db01f6e0d09e7f59ed72..0a37c02937b263b5b0ec6540f46d3b77bea25c26 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -681,8 +681,8 @@ void test_gpu_digamma() expected_out(2) = Scalar(1.2561176684318); expected_out(3) = Scalar(2.398239129535781); expected_out(4) = Scalar(9.210340372392849); - expected_out(5) = std::numeric_limits::infinity(); - expected_out(6) = std::numeric_limits::infinity(); + expected_out(5) = std::numeric_limits::quiet_NaN(); + expected_out(6) = std::numeric_limits::quiet_NaN(); std::size_t bytes = in.size() * sizeof(Scalar); @@ -704,11 +704,8 @@ void test_gpu_digamma() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - for (int i = 0; i < 5; ++i) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - for (int i = 5; i < 7; ++i) { - VERIFY_IS_EQUAL(out(i), expected_out(i)); + for (int i = 0; i < 7; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in); @@ -741,7 +738,7 @@ void test_gpu_zeta() expected_out(0) = std::numeric_limits::infinity(); expected_out(1) = Scalar(1.61237534869); expected_out(2) = Scalar(0.234848505667); - expected_out(3) = Scalar(1.03086757337e-5); + expected_out(3) = std::numeric_limits::quiet_NaN(); expected_out(4) = Scalar(0.367879440865); expected_out(5) = Scalar(0.054102025820864097); @@ -769,13 +766,8 @@ void test_gpu_zeta() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); @@ -1117,13 +1109,8 @@ void test_gpu_ndtri() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); @@ -1262,12 +1249,8 @@ void test_gpu_betainc() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - for (int i = 1; i < 125; ++i) { - if ((std::isnan)(expected_out(i))) { - VERIFY((std::isnan)(out(i))); - } else { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 125; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 062f76e26d40278004ff751937af7468b17c050c..e11782a794e1d04f862f42b7eeb75c06f5c7c023 100644 --- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -113,7 +113,7 @@ void test_gpu_unary() { gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = 
gpu_float.cast().abs().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().abs().template cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -154,7 +154,7 @@ void test_gpu_elementwise() { gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; - gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); + gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).template cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -329,26 +329,22 @@ void test_gpu_reductions(int size1, int size2, int redux) { int num_elem = size1*size2; int result_size = (redux == 1 ? size1 : size2); - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, size1, size2); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, size1, size2); + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, size1, size2); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, result_size); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, result_size); - gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; - gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + gpu_float.device(gpu_device) = gpu_float.random() * 2.0f; Eigen::array redux_dim = {redux}; - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim); + gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().sum(redux_dim); Tensor half_prec(result_size); Tensor full_prec(result_size); @@ -361,8 +357,7 @@ void test_gpu_reductions(int size1, int size2, int redux) { VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } @@ -386,25 +381,21 @@ void test_gpu_full_reductions() { int size = 13; int num_elem = size*size; - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, size, size); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, size, size); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float.device(gpu_device) = gpu_float.random(); - gpu_res_float.device(gpu_device) = gpu_float1.sum().cast(); - 
gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(); + gpu_res_float.device(gpu_device) = gpu_float.sum().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().sum(); Tensor half_prec; Tensor full_prec; @@ -414,16 +405,15 @@ void test_gpu_full_reductions() { VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().maximum(); + gpu_res_float.device(gpu_device) = gpu_float.maximum().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().maximum(); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); gpu_device.synchronize(); VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } @@ -454,8 +444,8 @@ void test_gpu_forced_evals() { gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half1.device(gpu_device) = gpu_float.cast().abs().eval().cast(); - gpu_res_half2.device(gpu_device) = gpu_float.cast().abs().broadcast(no_bcast).eval().cast(); + gpu_res_half1.device(gpu_device) = gpu_float.cast().abs().eval().template cast(); + gpu_res_half2.device(gpu_device) = gpu_float.cast().abs().broadcast(no_bcast).eval().template cast(); Tensor half_prec1(num_elem); Tensor half_prec2(num_elem); diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 2ec85d2d4805c66548c47879ed986bc6ef112d25..89a64c02172d1d259e88a163c3bba6aadd21c4a9 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -215,6 +215,59 @@ static void test_shuffle_unshuffle() } +template +static void test_empty_shuffling() +{ + Tensor tensor(2,3,0,7); + tensor.setRandom(); + array shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 0); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) { CALL_SUBTEST(test_simple_shuffling()); @@ -225,4 +278,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) CALL_SUBTEST(test_shuffling_as_value()); CALL_SUBTEST(test_shuffle_unshuffle()); CALL_SUBTEST(test_shuffle_unshuffle()); + CALL_SUBTEST(test_empty_shuffling()); + CALL_SUBTEST(test_empty_shuffling()); } diff --git a/unsupported/test/levenberg_marquardt.cpp 
b/unsupported/test/levenberg_marquardt.cpp index 7f9a81cd3063ef98753bd51c4de5984685db88ad..d0748d13ace34a52228261f8c907472af75295a2 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -24,7 +24,7 @@ using std::sqrt; // tolerance for chekcing number of iterations -#define LM_EVAL_COUNT_TOL 4/3 +#define LM_EVAL_COUNT_TOL 2 struct lmder_functor : DenseFunctor { @@ -75,11 +75,11 @@ void testLmder1() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.lmder1(x); - + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm VERIFY_IS_APPROX(lm.fvec().blueNorm(), 0.09063596); @@ -104,11 +104,12 @@ void testLmder() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm fnorm = lm.fvec().blueNorm(); @@ -177,9 +178,10 @@ void testLmdif1() lmdif_functor functor; DenseIndex nfev; info = LevenbergMarquardt::lmdif1(functor, x, &nfev); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(nfev, 26); // check norm @@ -208,9 +210,10 @@ void testLmdif() NumericalDiff numDiff(functor); LevenbergMarquardt > lm(numDiff); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 26); // check norm @@ -293,11 +296,12 @@ void testNistChwirut2(void) chwirut2_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 10); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02); // check x @@ -314,11 +318,12 @@ void testNistChwirut2(void) lm.setFtol(1.E6*NumTraits::epsilon()); lm.setXtol(1.E6*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 7); - VERIFY_IS_EQUAL(lm.njev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02); // check x @@ -373,11 +378,12 @@ void testNistMisra1a(void) misra1a_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 19); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 19); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01); // check x @@ -390,11 +396,12 @@ void testNistMisra1a(void) x<< 250., 0.0005; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 5); - VERIFY_IS_EQUAL(lm.njev(), 4); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 5); + // VERIFY_IS_EQUAL(lm.njev(), 4); // 
check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01); // check x @@ -464,11 +471,12 @@ void testNistHahn1(void) hahn1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 11); - VERIFY_IS_EQUAL(lm.njev(), 10); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 11); + // VERIFY_IS_EQUAL(lm.njev(), 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00); // check x @@ -486,11 +494,12 @@ void testNistHahn1(void) x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 11); - VERIFY_IS_EQUAL(lm.njev(), 10); + // VERIFY_IS_EQUAL(lm.njev(), 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00); // check x @@ -550,11 +559,12 @@ void testNistMisra1d(void) misra1d_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 7); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02); // check x @@ -567,11 +577,12 @@ void testNistMisra1d(void) x<< 450., 0.0003; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 4); - VERIFY_IS_EQUAL(lm.njev(), 3); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 4); + // VERIFY_IS_EQUAL(lm.njev(), 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02); // check x @@ -628,11 +639,12 @@ void testNistLanczos1(void) lanczos1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 79); - VERIFY_IS_EQUAL(lm.njev(), 72); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 79); + // VERIFY_IS_EQUAL(lm.njev(), 72); // check norm^2 VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x @@ -649,11 +661,12 @@ void testNistLanczos1(void) x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x @@ -714,11 +727,12 @@ void testNistRat42(void) rat42_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 10); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 10); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 
8.0565229338E+00); // check x @@ -732,11 +746,12 @@ void testNistRat42(void) x<< 75., 2.5, 0.07; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00); // check x @@ -787,14 +802,15 @@ void testNistMGH10(void) /* * First try */ - x<< 2., 400000., 25000.; + x << 2., 400000., 25000.; // do the computation MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); - ++g_test_level; - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - --g_test_level; + EIGEN_UNUSED_VARIABLE(info) + // ++g_test_level; + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // --g_test_level; // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 @@ -805,11 +821,11 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 284 ); - VERIFY_IS_EQUAL(lm.njev(), 249 ); - --g_test_level; + + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 284 ); + // VERIFY_IS_EQUAL(lm.njev(), 249 ); + // --g_test_level; VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL); @@ -819,11 +835,12 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); - ++g_test_level; - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - // was: VERIFY_IS_EQUAL(info, 1); - --g_test_level; - + EIGEN_UNUSED_VARIABLE(info) + // ++g_test_level; + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // // was: VERIFY_IS_EQUAL(info, 1); + // --g_test_level; + // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); // check x @@ -832,10 +849,10 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 126); - VERIFY_IS_EQUAL(lm.njev(), 116); - --g_test_level; + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 126); + // VERIFY_IS_EQUAL(lm.njev(), 116); + // --g_test_level; VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL); } @@ -888,6 +905,7 @@ void testNistBoxBOD(void) lm.setXtol(1.E6*NumTraits::epsilon()); lm.setFactor(10); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03); @@ -896,9 +914,9 @@ void testNistBoxBOD(void) VERIFY_IS_APPROX(x[1], 5.4723748542E-01); // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY(lm.nfev() < 31); // 31 - VERIFY(lm.njev() < 25); // 25 + // VERIFY_IS_EQUAL(info, 1); + // VERIFY(lm.nfev() < 31); // 31 + // VERIFY(lm.njev() < 25); // 25 /* * Second try @@ -909,13 +927,14 @@ void testNistBoxBOD(void) lm.setFtol(NumTraits::epsilon()); lm.setXtol( NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 16 ); - VERIFY_IS_EQUAL(lm.njev(), 15 ); - --g_test_level; + // VERIFY_IS_EQUAL(info, 1); + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 16 ); + // VERIFY_IS_EQUAL(lm.njev(), 15 ); + // --g_test_level; 
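Every NIST fixture in this file gets the same treatment: keep the residual and parameter checks, comment out the brittle return-code and exact-count assertions, and bound the evaluation counts instead. Distilled, the retained check looks like this (the `Counts` struct is a stand-in for the solver's `nfev()`/`njev()` accessors, and the reference values below are illustrative):

#include <cassert>

struct Counts { int nfev; int njev; };  // stand-in for LevenbergMarquardt counters

// Bound rather than pin: exact counts drift across compilers, SIMD widths
// and libm implementations, while a 2x envelope stays meaningful.
void check_counts(Counts c, int nfev_ref, int njev_ref, int tol = 2) {
  assert(c.nfev <= nfev_ref * tol);
  assert(c.njev <= njev_ref * tol);
}

int main() {
  check_counts(Counts{30, 26}, 31, 25);  // passes: within the 2x envelope
  return 0;
}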
VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL); // check norm^2 @@ -975,6 +994,7 @@ void testNistMGH17(void) lm.setXtol(NumTraits::epsilon()); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05); @@ -987,8 +1007,8 @@ void testNistMGH17(void) // check return value // VERIFY_IS_EQUAL(info, 2); //FIXME Use (lm.info() == Success) - VERIFY(lm.nfev() < 700 ); // 602 - VERIFY(lm.njev() < 600 ); // 545 + // VERIFY(lm.nfev() < 700 ); // 602 + // VERIFY(lm.njev() < 600 ); // 545 /* * Second try @@ -997,11 +1017,12 @@ void testNistMGH17(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05); // check x @@ -1063,6 +1084,7 @@ void testNistMGH09(void) LevenbergMarquardt lm(functor); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04); @@ -1072,9 +1094,9 @@ void testNistMGH09(void) VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01 VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01 // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY(lm.nfev() < 510 ); // 490 - VERIFY(lm.njev() < 400 ); // 376 + // VERIFY_IS_EQUAL(info, 1); + // VERIFY(lm.nfev() < 510 ); // 490 + // VERIFY(lm.njev() < 400 ); // 376 /* * Second try @@ -1083,11 +1105,12 @@ void testNistMGH09(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 16); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04); // check x @@ -1149,11 +1172,12 @@ void testNistBennett5(void) LevenbergMarquardt lm(functor); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 758); - VERIFY_IS_EQUAL(lm.njev(), 744); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 758); + // VERIFY_IS_EQUAL(lm.njev(), 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04); // check x @@ -1167,11 +1191,12 @@ void testNistBennett5(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 203); - VERIFY_IS_EQUAL(lm.njev(), 192); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 203); + // VERIFY_IS_EQUAL(lm.njev(), 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04); // check x @@ -1237,11 +1262,12 @@ void testNistThurber(void) lm.setFtol(1.E4*NumTraits::epsilon()); lm.setXtol(1.E4*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 39); - VERIFY_IS_EQUAL(lm.njev(), 36); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 39); + // VERIFY_IS_EQUAL(lm.njev(), 36); // check norm^2 
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03); // check x @@ -1262,11 +1288,12 @@ void testNistThurber(void) lm.setFtol(1.E4*NumTraits::epsilon()); lm.setXtol(1.E4*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 29); - VERIFY_IS_EQUAL(lm.njev(), 28); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 29); + // VERIFY_IS_EQUAL(lm.njev(), 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03); // check x @@ -1329,11 +1356,12 @@ void testNistRat43(void) lm.setFtol(1.E6*NumTraits::epsilon()); lm.setXtol(1.E6*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 27); - VERIFY_IS_EQUAL(lm.njev(), 20); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 27); + // VERIFY_IS_EQUAL(lm.njev(), 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03); // check x @@ -1351,11 +1379,12 @@ void testNistRat43(void) lm.setFtol(1.E5*NumTraits::epsilon()); lm.setXtol(1.E5*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03); // check x @@ -1414,11 +1443,12 @@ void testNistEckerle4(void) eckerle4_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03); // check x @@ -1432,11 +1462,12 @@ void testNistEckerle4(void) x<< 1.5, 5., 450.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 7); - VERIFY_IS_EQUAL(lm.njev(), 6); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 7); + // VERIFY_IS_EQUAL(lm.njev(), 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03); // check x diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index dbaf9dbdf70475e6e070b2be8a72e88f6aa7a90d..ab1a030b8ed4b8be05863bbdde797e96348ad102 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -171,7 +171,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3f)); // see bug 614 - CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-12L)); CALL_SUBTEST_10(testGeneral(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13L)); @@ -184,7 +184,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3f)); - CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-12L)); 
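On the tolerance bump just above (1e-13L to 1e-12L for the 7x7 extended-precision cases): the subtests feed this parameter into a relative comparison, so the change admits one extra decimal digit of drift. A minimal restatement of such a check (not Eigen's exact isApprox; the residual value is illustrative):

#include <cmath>
#include <cstdio>

bool approx(long double a, long double b, long double tol) {
  return std::fabs(a - b) <= tol * std::fmax(std::fabs(a), std::fabs(b));
}

int main() {
  const long double err = 5e-13L;                              // an illustrative residual
  std::printf("%d\n", (int)approx(1.0L, 1.0L + err, 1e-12L));  // 1: passes the new bound
  std::printf("%d\n", (int)approx(1.0L, 1.0L + err, 1e-13L));  // 0: failed the old one
  return 0;
}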
   CALL_SUBTEST_10(testSingular(Matrix3d(), 1e-13));
   CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4f));
   CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13L));
@@ -197,7 +197,7 @@ EIGEN_DECLARE_TEST(matrix_power)
   CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4f));
   CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4f));
   CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3f));
-  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13L));
+  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-12L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(), 1e-13));
   CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4f));
   CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13L));
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 5cfd66a107551960ce4157939c46eb8e77194da2..0000000000000000000000000000000000000000
--- a/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3184 +0,0 @@
-/*
-    MPFR C++: Multi-precision floating point number class for C++.
-    Based on MPFR library:    http://mpfr.org
-
-    Project homepage:    http://www.holoborodko.com/pavel/mpfr
-    Contact e-mail:      pavel@holoborodko.com
-
-    Copyright (c) 2008-2016 Pavel Holoborodko
-
-    Contributors:
-    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
-    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
-    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
-    Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
-    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
-    Rodney James, Jorge Leitao, Jerome Benoit.
-
-    Licensing:
-    (A) MPFR C++ is under GNU General Public License ("GPL").
-
-    (B) Non-free licenses may also be purchased from the author, for users who
-        do not want their programs protected by the GPL.
-
-        The non-free licenses are for users that wish to use MPFR C++ in
-        their products but are unwilling to release their software
-        under the GPL (which would require them to release source code
-        and allow free redistribution).
-
-        Such users can purchase an unlimited-use license from the author.
-        Contact us for more details.
-
-    GNU General Public License ("GPL") copyright permissions statement:
-    **************************************************************************
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iomanip>
-#include <sstream>
-#include <iostream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <climits>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
-                                                // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
-                                                // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 5
-#define MPREAL_VERSION_STRING "3.6.5"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__) && defined(__INTEL_COMPILER)
-    #define IsInf(x) isinf EIGEN_NOT_A_MACRO (x)       // Intel ICC compiler on Linux
-
-#elif defined(_MSC_VER)                                // Microsoft Visual C++
-    #define IsInf(x) (!_finite(x))
-
-#else
-    #define IsInf(x) std::isinf EIGEN_NOT_A_MACRO (x)  // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
-    #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic).
-// Move semantic should be enabled with great care in multi-threading environments,
-// especially if MPFR uses custom memory allocators.
-// Everything should be thread-safe and support passing ownership over thread boundary.
-#if (__has_feature(cxx_rvalue_references) || \
-    defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
-    (defined(_MSC_VER) && _MSC_VER >= 1600) && !defined(MPREAL_DISABLE_MOVE_SEMANTIC))
-
-    #define MPREAL_HAVE_MOVE_SUPPORT
-
-    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
-    #define mpfr_is_initialized(x)    (0 != (x)->_mpfr_d)
-    #define mpfr_set_uninitialized(x) ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
-    (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
-    (defined(_MSC_VER) && _MSC_VER >= 1800) || \
-    (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1300))
-
-    #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T   // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
-    #define MPREAL_MSVC_DEBUGVIEW_CODE  DebugView = toString();
-    #define MPREAL_MSVC_DEBUGVIEW_DATA  std::string DebugView;
-#else
-    #define MPREAL_MSVC_DEBUGVIEW_CODE
-    #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
-    #include <cstdlib>  // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1  // Triggers overflow exception during conversion to double if mpreal
-                                        // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
-                                        // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be forward compatible
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x) ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
-  #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
-  #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
-    mpfr_t mp;
-
-public:
-
-    // Get default rounding mode & precision
-    inline static mp_rnd_t  get_default_rnd()  { return (mp_rnd_t)(mpfr_get_default_rounding_mode()); }
-    inline static mp_prec_t get_default_prec() { return (mpfr_get_default_prec)(); }
-
-    // Constructors && type conversions
-    mpreal();
-    mpreal(const mpreal& u);
-    mpreal(const mpf_t u);
-    mpreal(const mpz_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const mpq_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
-    // Construct mpreal from mpfr_t structure.
-    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
-    mpreal(const mpfr_t u, bool shared = false);
-
-    mpreal(const char* s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const std::string& s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
-    ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    mpreal& operator=(mpreal&& v);
-    mpreal(mpreal&& u);
-#endif
-
-    // Operations
-    // =
-    // +, -, *, /, ++, --, <<, >>
-    // *=, +=, -=, /=,
-    // <, >, ==, <=, >=
-
-    // =
-    mpreal& operator=(const mpreal& v);
-    mpreal& operator=(const mpf_t v);
-    mpreal& operator=(const mpz_t v);
-    mpreal& operator=(const mpq_t v);
-    mpreal& operator=(const long double v);
-    mpreal& operator=(const double v);
-    mpreal& operator=(const unsigned long int v);
-    mpreal& operator=(const unsigned long long int v);
-    mpreal& operator=(const long long int v);
-    mpreal& operator=(const unsigned int v);
-    mpreal& operator=(const long int v);
-    mpreal& operator=(const int v);
-    mpreal& operator=(const char* s);
-    mpreal& operator=(const std::string& s);
-    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
-    // +
-    mpreal& operator+=(const mpreal& v);
-    mpreal& operator+=(const mpf_t v);
-    mpreal& operator+=(const mpz_t v);
-    mpreal& operator+=(const mpq_t v);
-    mpreal& operator+=(const long double u);
-    mpreal& operator+=(const double u);
-    mpreal& operator+=(const unsigned long int u);
-    mpreal& operator+=(const unsigned int u);
-    mpreal& operator+=(const long int u);
-    mpreal& operator+=(const int u);
-
-    mpreal& operator+=(const long long int u);
-    mpreal& operator+=(const unsigned long long int u);
-    mpreal& operator-=(const long long int u);
-    mpreal& operator-=(const unsigned long long int u);
-    mpreal& operator*=(const long long int u);
-    mpreal& operator*=(const unsigned long long int u);
-    mpreal& operator/=(const long long int u);
-    mpreal& operator/=(const unsigned long long int u);
-
-    const mpreal operator+() const;
-    mpreal& operator++ ();
-    const mpreal operator++ (int);
-
-    // -
-    mpreal& operator-=(const mpreal& v);
-    mpreal& operator-=(const mpz_t v);
-    mpreal& operator-=(const mpq_t v);
-    mpreal& operator-=(const long double u);
-    mpreal& operator-=(const double u);
-    mpreal& operator-=(const unsigned long int u);
-    mpreal& operator-=(const unsigned int u);
-    mpreal& operator-=(const long int u);
-    mpreal& operator-=(const int u);
-    const mpreal operator-() const;
-    friend const mpreal operator-(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator-(const unsigned int b, const mpreal& a);
-    friend const mpreal operator-(const long int b, const mpreal& a);
-    friend const mpreal operator-(const int b, const mpreal& a);
-    friend const mpreal operator-(const double b, const mpreal& a);
-    mpreal& operator-- ();
-    const mpreal operator-- (int);
-
-    // *
-    mpreal& operator*=(const mpreal& v);
-    mpreal& operator*=(const mpz_t v);
-    mpreal& operator*=(const mpq_t v);
-    mpreal& operator*=(const long double v);
-    mpreal& operator*=(const double v);
-    mpreal& operator*=(const unsigned long int v);
-    mpreal& operator*=(const unsigned int v);
-    mpreal& operator*=(const long int v);
-    mpreal& operator*=(const int v);
-
-    // /
-    mpreal& operator/=(const mpreal& v);
-    mpreal& operator/=(const mpz_t v);
-    mpreal& operator/=(const mpq_t v);
-    mpreal& operator/=(const long double v);
-    mpreal& operator/=(const double v);
-    mpreal& operator/=(const unsigned long int v);
-    mpreal& operator/=(const unsigned int v);
-    mpreal& operator/=(const long int v);
-    mpreal& operator/=(const int v);
-    friend const mpreal operator/(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator/(const unsigned int b, const mpreal& a);
-    friend const mpreal operator/(const long int b, const mpreal& a);
-    friend const mpreal operator/(const int b, const mpreal& a);
-    friend const mpreal operator/(const double b, const mpreal& a);
-
-    //<<= Fast Multiplication by 2^u
-    mpreal& operator<<=(const unsigned long int u);
-    mpreal& operator<<=(const unsigned int u);
-    mpreal& operator<<=(const long int u);
-    mpreal& operator<<=(const int u);
-
-    //>>= Fast Division by 2^u
-    mpreal& operator>>=(const unsigned long int u);
-    mpreal& operator>>=(const unsigned int u);
-    mpreal& operator>>=(const long int u);
-    mpreal& operator>>=(const int u);
-
-    // Type Conversion operators
-    bool               toBool    ( )                        const;
-    long               toLong    (mp_rnd_t mode = GMP_RNDZ) const;
-    unsigned long      toULong   (mp_rnd_t mode = GMP_RNDZ) const;
-    long long          toLLong   (mp_rnd_t mode = GMP_RNDZ) const;
-    unsigned long long toULLong  (mp_rnd_t mode = GMP_RNDZ) const;
-    float              toFloat   (mp_rnd_t mode = GMP_RNDN) const;
-    double             toDouble  (mp_rnd_t mode = GMP_RNDN) const;
-    long double        toLDouble (mp_rnd_t mode = GMP_RNDN) const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator bool               () const { return toBool(); }
-    explicit operator int                () const { return int(toLong()); }
-    explicit operator long               () const { return toLong(); }
-    explicit operator long long          () const { return toLLong(); }
-    explicit operator unsigned           () const { return unsigned(toULong()); }
-    explicit operator unsigned long      () const { return toULong(); }
-    explicit operator unsigned long long () const { return toULLong(); }
-    explicit operator float              () const { return toFloat(); }
-    explicit operator double             () const { return toDouble(); }
-    explicit operator long double        () const { return toLDouble(); }
-#endif
-
-    // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
-    ::mpfr_ptr    mpfr_ptr();
-    ::mpfr_srcptr mpfr_ptr()    const;
-    ::mpfr_srcptr mpfr_srcptr() const;
-
-    // Convert mpreal to string with n significant digits in base b
-    // n = -1 -> convert with the maximum available digits
-    std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    std::string toString(const std::string& format) const;
-#endif
-
-    std::ostream& output(std::ostream& os) const;
-
-    // Math Functions
-    friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
-    friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend int cmpabs(const mpreal& a,const mpreal& b);
-
-    friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal nextpow2(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
-    friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal acos  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asin  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
-    friend const mpreal acot  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asec  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsc  (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cosh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sinh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tanh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sech  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csch  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal coth  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    friend const mpreal fac_ui (unsigned long int v, mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
-    friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
-    friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
-    friend int sgn (const mpreal& v);
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    friend int          sinh_cosh (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal li2       (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal fmod      (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal rec_sqrt  (const mpreal& v, mp_rnd_t rnd_mode);
-
-    // MATLAB's semantic equivalents
-    friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
-    friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend const mpreal digamma (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal ai      (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal grandom (unsigned int seed);
-#endif
-
-    // Uniformly distributed random number generation in [0,1] using
-    // Mersenne-Twister algorithm by default.
-    // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
-    // Check urandom() for more precise control.
-    friend const mpreal random(unsigned int seed);
-
-    // Splits mpreal value into fractional and integer parts.
-    // Returns fractional part and stores integer part in n.
-    friend const mpreal modf(const mpreal& v, mpreal& n);
-
-    // Constants
-    // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
-    friend const mpreal const_log2    (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_pi      (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_euler   (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_catalan (mp_prec_t prec, mp_rnd_t rnd_mode);
-
-    // returns +inf iff sign>=0 otherwise -inf
-    friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
-    // Output/ Input
-    friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
-    friend std::istream& operator>>(std::istream& is, mpreal& v);
-
-    // Integer Related Functions
-    friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal ceil (const mpreal& v);
-    friend const mpreal floor(const mpreal& v);
-    friend const mpreal round(const mpreal& v);
-    friend const mpreal trunc(const mpreal& v);
-    friend const mpreal rint_ceil  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_floor (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_round (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_trunc (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal frac       (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal remainder  ( const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal remquo     (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    // Miscellaneous Functions
-    friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
-    friend const mpreal nextabove  (const mpreal& x);
-    friend const mpreal nextbelow  (const mpreal& x);
-
-    // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-    friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
-    // Instance Checkers
-    friend bool isnan EIGEN_NOT_A_MACRO (const mpreal& v);
-    friend bool (isinf)    (const mpreal& v);
-    friend bool (isfinite) (const mpreal& v);
-
-    friend bool isnum  (const mpreal& v);
-    friend bool iszero (const mpreal& v);
-    friend bool isint  (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend bool isregular(const mpreal& v);
-#endif
-
-    // Set/Get instance properties
-    inline mp_prec_t get_prec() const;
-    inline void      set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd()); // Change precision with rounding mode
-
-    // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex interface
-    inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
-    inline int     getPrecision() const;
-
-    // Set mpreal to +/- inf, NaN, +/-0
-    mpreal& setInf  (int Sign = +1);
-    mpreal& setNan  ();
-    mpreal& setZero (int Sign = +1);
-    mpreal& setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
-    //Exponent
-    mp_exp_t get_exp() const;
-    int set_exp(mp_exp_t e);
-    int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
-    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
-    // Inexact conversion from float
-    inline bool fits_in_bits(double x, int n);
-
-    // Set/Get global properties
-    static void set_default_prec(mp_prec_t prec);
-    static void set_default_rnd(mp_rnd_t rnd_mode);
-
-    static mp_exp_t get_emin (void);
-    static mp_exp_t get_emax (void);
-    static mp_exp_t get_emin_min (void);
-    static mp_exp_t get_emin_max (void);
-    static mp_exp_t get_emax_min (void);
-    static mp_exp_t get_emax_max (void);
-    static int set_emin (mp_exp_t exp);
-    static int set_emax (mp_exp_t exp);
-
-    // Efficient swapping of two mpreal values - needed for std algorithms
-    friend void swap(mpreal& x, mpreal& y);
-
-    friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
-    // Human friendly Debug Preview in Visual Studio.
-    // Put one of these lines:
-    //
-    // mpfr::mpreal=<DebugView>                              ; Show value only
-    // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits    ; Show value & precision
-    //
-    // at the beginning of
-    // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
-    MPREAL_MSVC_DEBUGVIEW_DATA
-
-    // "Smart" resources deallocation. Checks if instance initialized before deletion.
-    void clear(::mpfr_ptr);
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
-    std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
-    mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
-    mpfr_set_zero_fast(mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
-    mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
-    mpfr_set  (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
-    mpfr_set_uninitialized(mpfr_ptr());      // make sure "other" holds null-pointer (in uninitialized state)
-    mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
-    if (this != &other)
-    {
-        mpfr_swap(mpfr_ptr(), other.mpfr_ptr()); // destructor for "other" will be called just afterwards
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-    return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t u, bool shared)
-{
-    if(shared)
-    {
-        std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
-    }
-    else
-    {
-        mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
-        mpfr_set  (mpfr_ptr(), u, mpreal::get_default_rnd());
-    }
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpf_t u)
-{
-    mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
-    mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_z(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_q(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-    if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
-    {
-        mpfr_set_d(mpfr_ptr(), u, mode);
-    }else
-        throw conversion_overflow();
-#else
-    mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ld(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_uj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_sj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s, base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    if(mpfr_is_initialized(x))
-#endif
-    mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
-    clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
-    // Use SFINAE to restrict arithmetic operations instantiation only for numeric types
-    // This is needed for smooth integration with libraries based on expression templates, like Eigen.
-    // TODO: Do the same for boolean operators.
-    template <typename ArgumentType> struct result_type {};
-
-    template <> struct result_type<mpreal>             {typedef mpreal type;};
-    template <> struct result_type<mpz_t>              {typedef mpreal type;};
-    template <> struct result_type<mpq_t>              {typedef mpreal type;};
-    template <> struct result_type<long double>        {typedef mpreal type;};
-    template <> struct result_type<double>             {typedef mpreal type;};
-    template <> struct result_type<unsigned long int>  {typedef mpreal type;};
-    template <> struct result_type<unsigned int>       {typedef mpreal type;};
-    template <> struct result_type<long int>           {typedef mpreal type;};
-    template <> struct result_type<int>                {typedef mpreal type;};
-    template <> struct result_type<long long>          {typedef mpreal type;};
-    template <> struct result_type<unsigned long long> {typedef mpreal type;};
-}
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
-// Gives max & min values for the required precision,
-// minval is 'safe' meaning 1 / minval does not overflow
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
-// 'Bitwise' equality check
-//  maxUlps - a and b can be apart by maxUlps binary numbers.
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-//    bits   = ceil(digits*log[2](10))
-//    digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int       bits2digits(mp_prec_t b);
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
-    if (this != &v)
-    {
-        mp_prec_t tp = mpfr_get_prec(  mpfr_srcptr());
-        mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
-        if(tp != vp){
-            clear(mpfr_ptr());
-            mpfr_init2(mpfr_ptr(), vp);
-        }
-
-        mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
-    mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
-    mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
-    mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
-    mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-    if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
-    {
-        mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-    }else
-        throw conversion_overflow();
-#else
-    mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
-    mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
-    mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //    mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //    mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
-
-    mpfr_t t;
-
-    mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
-    if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
-    {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-
-    clear(t);
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //    mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //    mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
- - mpfr_t t; - - mpfr_init2(t, mpfr_get_prec(mpfr_srcptr())); - - if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd())) - { - mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - } - - clear(t); - return *this; -} - -template -inline mpreal& mpreal::operator= (const std::complex& z) -{ - return *this = z.real(); -} - -////////////////////////////////////////////////////////////////////////// -// + Addition -inline mpreal& mpreal::operator+=(const mpreal& v) -{ - mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpf_t u) -{ - *this += mpreal(u); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpz_t u) -{ - mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpq_t u) -{ - mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+= (const long double u) -{ - *this += mpreal(u); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+= (const double u) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); -#else - *this += mpreal(u); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const unsigned long int u) -{ - mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const unsigned int u) -{ - mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const long int u) -{ - mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const int u) -{ - mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } - -inline const mpreal mpreal::operator+()const { return mpreal(*this); } - -inline const mpreal operator+(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), 
mpreal::get_default_rnd()); - return c; -} - -inline mpreal& mpreal::operator++() -{ - return *this += 1; -} - -inline const mpreal mpreal::operator++ (int) -{ - mpreal x(*this); - *this += 1; - return x; -} - -inline mpreal& mpreal::operator--() -{ - return *this -= 1; -} - -inline const mpreal mpreal::operator-- (int) -{ - mpreal x(*this); - *this -= 1; - return x; -} - -////////////////////////////////////////////////////////////////////////// -// - Subtraction -inline mpreal& mpreal::operator-=(const mpreal& v) -{ - mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const mpz_t v) -{ - mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const mpq_t v) -{ - mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const long double v) -{ - *this -= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this -= mpreal(v); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const unsigned long int v) -{ - mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const unsigned int v) -{ - mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const long int v) -{ - mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const int v) -{ - mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal mpreal::operator-()const -{ - mpreal u(*this); - mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd()); - return u; -} - -inline const mpreal operator-(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline const mpreal operator-(const double b, const mpreal& a) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -#else - mpreal x(b, mpfr_get_prec(a.mpfr_ptr())); - x -= a; - return x; -#endif -} - -inline const mpreal operator-(const unsigned long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const unsigned int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const int b, const mpreal& a) -{ - mpreal x(0, 
mpfr_get_prec(a.mpfr_ptr())); - mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -////////////////////////////////////////////////////////////////////////// -// * Multiplication -inline mpreal& mpreal::operator*= (const mpreal& v) -{ - mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const mpz_t v) -{ - mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const mpq_t v) -{ - mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const long double v) -{ - *this *= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this *= mpreal(v); -#endif - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const unsigned long int v) -{ - mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const unsigned int v) -{ - mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const long int v) -{ - mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const int v) -{ - mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator*(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -////////////////////////////////////////////////////////////////////////// -// / Division -inline mpreal& mpreal::operator/=(const mpreal& v) -{ - mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const mpz_t v) -{ - mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const mpq_t v) -{ - mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const long double v) -{ - *this /= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this /= mpreal(v); -#endif - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const unsigned long int v) -{ - mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const unsigned int v) -{ - mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const long int v) -{ - 
mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const int v) -{ - mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator/(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr()))); - mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline const mpreal operator/(const unsigned long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const unsigned int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const double b, const mpreal& a) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -#else - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - x /= a; - return x; -#endif -} - -////////////////////////////////////////////////////////////////////////// -// Shifts operators - Multiplication/Division by power of 2 -inline mpreal& mpreal::operator<<=(const unsigned long int u) -{ - mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const unsigned int u) -{ - mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const long int u) -{ - mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const int u) -{ - mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const unsigned long int u) -{ - mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const unsigned int u) -{ - mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const long int u) -{ - mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const int u) -{ - mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator<<(const mpreal& v, const unsigned long int k) -{ - return mul_2ui(v,k); -} - -inline const mpreal operator<<(const mpreal& v, const unsigned int k) -{ - return 
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
-    return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
-    return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
-    return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
-    return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
-    return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
-    return div_2si(v,static_cast<long int>(k));
-}
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-//Relational operators
-
-// WARNING:
-//
-// Please note that the following checks for double-NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) =  (b != b)
-// isnan(b) = !(b == b)  (we use in code below)
-//
-// Be cautious if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
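
The self-comparison trick in the warning above relies on IEEE 754 semantics: NaN is the only value for which b == b is false. A minimal sketch; note that with -ffast-math GCC may fold b == b to true, which is exactly the failure mode the comment warns about:

    #include <cmath>
    #include <cassert>

    int main()
    {
        double b = std::nan("");
        assert(!(b == b));      // NaN compares unequal to itself
        assert(std::isnan(b));  // the C++11 alternative recommended above
    }
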
-
-inline bool operator > (const mpreal& a, const mpreal& b            ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator > (const mpreal& a, const unsigned long int b  ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const unsigned int b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long int b           ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const int b                ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long double b        ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const double b             ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); }
-
-inline bool operator >= (const mpreal& a, const mpreal& b           ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long int b          ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const int b               ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long double b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const double b            ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); }
-
-inline bool operator < (const mpreal& a, const mpreal& b            ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator < (const mpreal& a, const unsigned long int b  ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const unsigned int b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long int b           ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const int b                ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long double b        ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const double b             ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); }
-
-inline bool operator <= (const mpreal& a, const mpreal& b           ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long int b          ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
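
Because every ordered comparison involving a NaN yields false, these operators do not form a strict weak ordering over data that may contain NaN, so handing such values straight to std::sort is undefined behavior. A hedged sketch of one common workaround (hypothetical helper name, not part of the header):

    #include <mpreal.h>
    #include <algorithm>
    #include <vector>
    using mpfr::mpreal;

    // Partition NaNs to the end first, then sort the clean prefix.
    void safe_sort(std::vector<mpreal>& v)
    {
        auto mid = std::partition(v.begin(), v.end(),
                                  [](const mpreal& x) { return !(mpfr::isnan)(x); });
        std::sort(v.begin(), mid);
    }
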
-inline bool operator <= (const mpreal& a, const int b               ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long double b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const double b            ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); }
-
-inline bool operator == (const mpreal& a, const mpreal& b           ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long int b          ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const int b               ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long double b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const double b            ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); }
-
-inline bool operator != (const mpreal& a, const mpreal& b           ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned int b      ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long int b          ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const int b               ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long double b       ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const double b            ){ return !(a == b); }
-
-inline bool isnan EIGEN_NOT_A_MACRO (const mpreal& op){ return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 ); }
-inline bool (isinf)    (const mpreal& op){ return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 ); }
-inline bool (isfinite) (const mpreal& op){ return (mpfr_number_p (op.mpfr_srcptr()) != 0 ); }
-inline bool iszero     (const mpreal& op){ return (mpfr_zero_p   (op.mpfr_srcptr()) != 0 ); }
-inline bool isint      (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcptr()) != 0 ); }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool               mpreal::toBool    ( )               const { return  mpfr_zero_p (mpfr_srcptr()) == 0; }
-inline long               mpreal::toLong    (mp_rnd_t mode)   const { return  mpfr_get_si (mpfr_srcptr(), mode); }
-inline unsigned long      mpreal::toULong   (mp_rnd_t mode)   const { return  mpfr_get_ui (mpfr_srcptr(), mode); }
-inline float              mpreal::toFloat   (mp_rnd_t mode)   const { return  mpfr_get_flt(mpfr_srcptr(), mode); }
-inline double             mpreal::toDouble  (mp_rnd_t mode)   const { return  mpfr_get_d  (mpfr_srcptr(), mode); }
-inline long double        mpreal::toLDouble (mp_rnd_t mode)   const { return  mpfr_get_ld (mpfr_srcptr(), mode); }
-inline long long          mpreal::toLLong   (mp_rnd_t mode)   const { return  mpfr_get_sj (mpfr_srcptr(), mode); }
-inline unsigned long long mpreal::toULLong  (mp_rnd_t mode)   const { return  mpfr_get_uj (mpfr_srcptr(), mode); }
-
-inline ::mpfr_ptr     mpreal::mpfr_ptr()             { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_ptr()    const    { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_srcptr() const    { return mp; }
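
All of the converters above take an explicit rounding mode, so narrowing is under caller control instead of being fixed at round-to-nearest. A small sketch, assuming mpfr::mpreal and the GMP_RND* constants used throughout this header:

    #include <mpreal.h>
    using mpfr::mpreal;

    int main()
    {
        mpreal x("2.75");
        long   down = x.toLong(GMP_RNDD);  // 2: round toward -infinity
        long   up   = x.toLong(GMP_RNDU);  // 3: round toward +infinity
        double d    = x.toDouble();        // default rounding mode
        return (down == 2 && up == 3 && d == 2.75) ? 0 : 1;
    }
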
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
-    std::ostringstream oss;
-    oss << f << t;
-    return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
-    char *s = NULL;
-    std::string out;
-
-    if( !format.empty() )
-    {
-        if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
-        {
-            out = std::string(s);
-
-            mpfr_free_str(s);
-        }
-    }
-
-    return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
-    // TODO: Add extended format specification (f, e, rounding mode) as it done in output operator
-    (void)b;
-    (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-    std::ostringstream format;
-
-    int digits = (n >= 0) ? n : 2 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
-    format << "%." << digits << "RNg";
-
-    return toString(format.str());
-
-#else
-
-    char *s, *ns = NULL;
-    size_t slen, nslen;
-    mp_exp_t exp;
-    std::string out;
-
-    if(mpfr_inf_p(mp))
-    {
-        if(mpfr_sgn(mp)>0) return "+Inf";
-        else               return "-Inf";
-    }
-
-    if(mpfr_zero_p(mp)) return "0";
-    if(mpfr_nan_p(mp))  return "NaN";
-
-    s  = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
-    ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
-    if(s!=NULL && ns!=NULL)
-    {
-        slen  = strlen(s);
-        nslen = strlen(ns);
-        if(nslen<=slen)
-        {
-            mpfr_free_str(s);
-            s = ns;
-            slen = nslen;
-        }
-        else {
-            mpfr_free_str(ns);
-        }
-
-        // Make human eye-friendly formatting if possible
-        if (exp>0 && static_cast<size_t>(exp)<slen)
-        {
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp) ptr--;
-
-                if(ptr==s+exp) out = std::string(s,exp+1);
-                else           out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
-                //out = string(s,exp+1)+'.'+string(s+exp+1);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
-                if(ptr==s+exp-1) out = std::string(s,exp);
-                else             out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
-                //out = string(s,exp)+'.'+string(s+exp);
-            }
-
-        }else{ // exp<0 || exp>slen
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+1) ptr--;
-
-                if(ptr==s+1) out = std::string(s,2);
-                else         out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
-                //out = string(s,2)+'.'+string(s+2);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s) ptr--;
-
-                if(ptr==s) out = std::string(s,1);
-                else       out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
-                //out = string(s,1)+'.'+string(s+1);
-            }
-
-            // Make final string
-            if(--exp)
-            {
-                if(exp>0) out += "e+"+mpfr::toString(exp,std::dec);
-                else      out += "e"+mpfr::toString(exp,std::dec);
-            }
-        }
-
-        mpfr_free_str(s);
-        return out;
-    }else{
-        return "conversion error!";
-    }
-#endif
-}
-
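
The format string accepted by toString() is handed straight to mpfr_asprintf, so it uses MPFR's printf extensions: a precision, an embedded rounding mode letter (N, Z, U, D), and a conversion specifier, exactly like the "%.<digits>RNg" string the code above builds. A brief sketch:

    #include <mpreal.h>
    #include <iostream>
    using mpfr::mpreal;

    int main()
    {
        mpreal pi = mpfr::const_pi(256);             // pi at 256 bits
        std::cout << pi.toString("%.50RNg") << '\n'; // 50 digits, round-to-nearest
    }
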
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
-    std::ostringstream format;
-    const std::ios::fmtflags flags = os.flags();
-
-    format << ((flags & std::ios::showpos) ? "%+" : "%");
-    if (os.precision() >= 0)
-        format << '.' << os.precision() << "R*"
-               << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
-                   (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
-                   'g');
-    else
-        format << "R*e";
-
-    char *s = NULL;
-    if(!(mpfr_asprintf(&s, format.str().c_str(),
-                        mpfr::mpreal::get_default_rnd(),
-                        mpfr_srcptr())
-        < 0))
-    {
-        os << std::string(s);
-        mpfr_free_str(s);
-    }
-    return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
-    return v.output(os);
-}
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
-    // TODO: use cout::hexfloat and other flags to setup base
-    std::string tmp;
-    is >> tmp;
-    mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
-    return is;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Bits - decimal digits relation
-//    bits   = ceil(digits*log[2](10))
-//    digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
-    const double LOG2_10 = 3.3219280948873624;
-
-    return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
-    const double LOG10_2 = 0.30102999566398119;
-
-    return int(std::floor( b * LOG10_2 ));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
-    mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), sign < 0, RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
-    return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
-    mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
-    mpfr_set_inf(mpfr_ptr(), sign);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
-    mpfr_set_nan(mpfr_ptr());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    mpfr_set_zero(mpfr_ptr(), sign);
-#else
-    mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
-    setSign(sign);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
-    return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
-    mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp () const
-{
-    return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
-    int x = mpfr_set_exp(mpfr_ptr(), e);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
-    *exp = mpfr_get_exp(y.mpfr_srcptr());
-    mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
-    return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
-    mpreal x(v);
-
-    // rounding is not important since we are just increasing the exponent (= exact operation)
-    mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
-    return ldexp(v, exp);
-}
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
-    /* the smallest eps such that 1 + eps != 1 */
-    return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
-    /* the smallest eps such that x + eps != x */
-    if( x < 0)
-    {
-        return nextabove(-x) + x;
-    }else{
-        return nextabove( x) - x;
-    }
-}
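
For a 53-bit mpreal, machine_epsilon matches the familiar double epsilon 2^-52, since eps(1) = nextabove(1) - 1. A quick check, assuming mpfr::mpreal as above:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        mpreal eps = mpfr::machine_epsilon(53);         // at 53-bit precision
        assert(eps == mpfr::ldexp(mpreal(1, 53), -52)); // equals 2^-52
    }
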
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
-    /* min = 1/2 * 2^emin = 2^(emin - 1) */
-    return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
-    /* max = (1 - eps) * 2^emax, eps is machine epsilon */
-    return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
-    return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
-    return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
-    return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
-    mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
-    return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
-    return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline mpreal& setsignbit(mpreal& x, bool minus, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpfr_setsign(x.mpfr_ptr(), x.mpfr_srcptr(), minus, rnd_mode);
-    return x;
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
-    mpreal f(v);
-
-    // rounding is not important since we are using the same number
-    mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
-    mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
-    return f;
-}
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
-    return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
-    int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
-    return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
-    return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
-    return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
-    return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
-    return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
-    return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
-    return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
-    return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f)                    \
-        mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));          \
-        mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r);           \
-        return y;
-
-inline const mpreal sqr  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqr );    }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt);    }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
-    mpreal y;
-    mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
-    return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
-    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
-    #if (MPFR_VERSION >= MPFR_VERSION_NUM(4,0,0))
-    mpfr_rootn_ui(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
-    #else
-    mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
-    #endif
-    return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
-    return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
-    return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt  (const long double v, mp_rnd_t rnd_mode)    {    return sqrt(mpreal(v),rnd_mode);    }
-inline const mpreal sqrt  (const double v, mp_rnd_t rnd_mode)         {    return sqrt(mpreal(v),rnd_mode);    }
-
-inline const mpreal cbrt  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt );    }
-inline const mpreal fabs  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal abs   (const mpreal& x, mp_rnd_t r)                             {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal log   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log  );    }
-inline const mpreal log2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log2 );    }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log10);    }
-inline const mpreal exp   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp  );    }
-inline const mpreal exp2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 );    }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp10);    }
-inline const mpreal cos   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cos  );    }
-inline const mpreal sin   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sin  );    }
-inline const mpreal tan   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tan  );    }
-inline const mpreal sec   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sec  );    }
-inline const mpreal csc   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csc  );    }
-inline const mpreal cot   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cot  );    }
-inline const mpreal acos  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acos );    }
-inline const mpreal asin  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asin );    }
-inline const mpreal atan  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atan );    }
-
-inline const mpreal logb  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   return log2 (abs(x),r);    }
-
-inline const mpreal acot  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atan (1/v, r);    }
-inline const mpreal asec  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acos (1/v, r);    }
-inline const mpreal acsc  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asin (1/v, r);    }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atanh(1/v, r);    }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acosh(1/v, r);    }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asinh(1/v, r);    }
-
-inline const mpreal cosh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cosh );    }
-inline const mpreal sinh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sinh );    }
-inline const mpreal tanh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tanh );    }
-inline const mpreal sech  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sech );    }
-inline const mpreal csch  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csch );    }
-inline const mpreal coth  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(coth );    }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acosh);    }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asinh);    }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atanh);    }
-
-inline const mpreal log1p   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log1p  );    }
-inline const mpreal expm1   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(expm1  );    }
-inline const mpreal eint    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(eint   );    }
-inline const mpreal gamma   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal tgamma  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma);    }
-inline const mpreal zeta    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(zeta   );    }
-inline const mpreal erf     (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erf    );    }
-inline const mpreal erfc    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erfc   );    }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j0     );    }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j1     );    }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y0     );    }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y1     );    }
-
-inline const mpreal nextpow2(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, x.getPrecision());
-
-    if(!iszero(x))
-        y = ceil(log2(abs(x,r),r));
-
-    return y;
-}
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal hypot(const mpreal& a, const mpreal& b, const mpreal& c)
-{
-    if(isnan EIGEN_NOT_A_MACRO (a) || isnan EIGEN_NOT_A_MACRO (b) || isnan EIGEN_NOT_A_MACRO(c)) return mpreal().setNan();
-    else
-    {
-        mpreal absa = abs(a), absb = abs(b), absc = abs(c);
-        mpreal w = (std::max)(absa, (std::max)(absb, absc));
-        mpreal r;
-
-        if (!iszero(w))
-        {
-            mpreal iw = 1/w;
-            r = w * sqrt(sqr(absa*iw) + sqr(absb*iw) + sqr(absc*iw));
-        }
-
-        return r;
-    }
-}
-
-inline const mpreal hypot(const mpreal& a, const mpreal& b, const mpreal& c, const mpreal& d)
-{
-    if(isnan EIGEN_NOT_A_MACRO (a) || isnan EIGEN_NOT_A_MACRO (b) || isnan EIGEN_NOT_A_MACRO (c) || isnan EIGEN_NOT_A_MACRO (d)) return mpreal().setNan();
-    else
-    {
-        mpreal absa = abs(a), absb = abs(b), absc = abs(c), absd = abs(d);
-        mpreal w = (std::max)(absa, (std::max)(absb, (std::max)(absc, absd)));
-        mpreal r;
-
-        if (!iszero(w))
-        {
-            mpreal iw = 1/w;
-            r = w * sqrt(sqr(absa*iw) + sqr(absb*iw) + sqr(absc*iw) + sqr(absd*iw));
-        }
-
-        return r;
-    }
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec = mpreal::get_default_prec(),
-                            mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(0, prec);
-    mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
-    return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    int tsignp;
-
-    if(signp)   mpfr_lgamma(x.mpfr_ptr(), signp,v.mpfr_srcptr(),rnd_mode);
-    else        mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
-    return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, x.getPrecision());
-    mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, x.getPrecision());
-    mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-
-    a.set_prec(p1>p2?p1:p2);
-
-    mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpfr_srcptr *p = new mpfr_srcptr[n];
-
-    for (unsigned long int i = 0; i < n; i++)
-        p[i] = tab[i].mpfr_srcptr();
-
-    mpreal x;
-    status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
-    delete [] p;
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    /*  R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
-    return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    (void)rnd_mode;
-
-    /*
-
-    m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
-    The following are true by convention:
-
-    - mod(x,0) is x
-    - mod(x,x) is 0
-    - mod(x,y) for x != y and y != 0 has the same sign as y.
-
-    */
-
-    if(iszero(y)) return x;
-    if(x == y) return 0;
-
-    mpreal m = x - floor(x / y) * y;
-
-    return copysign(abs(m),y); // make sure result has the same sign as Y
-}
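
The practical difference between mod and fmod here is the sign convention: mod uses floored division and follows the sign of y, while fmod (below) uses truncated division and follows the sign of x. A small worked example with hypothetical inputs:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        mpreal x(-7), y(3);
        assert(mpfr::mod (x, y) ==  2);  // -7 - floor(-7/3)*3 = -7 + 9
        assert(mpfr::fmod(x, y) == -1);  // -7 - trunc(-7/3)*3 = -7 + 6
    }
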
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t yp, xp;
-
-    yp = y.get_prec();
-    xp = x.get_prec();
-
-    a.set_prec(yp>xp?yp:xp);
-
-    mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
-    return x;
-}
-#endif //  MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(digamma);    }
-inline const mpreal ai      (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(ai);         }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_log2(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_pi(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_euler(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_catalan(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
-    mpreal x(0, p);
-    mpfr_set_inf(x.mpfr_ptr(), sign);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_ceil(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_floor(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_round(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_trunc(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal rint       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint      );    }
-inline const mpreal rint_ceil  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil );    }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor);    }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round);    }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc);    }
-inline const mpreal frac       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(frac      );    }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline int sgn(const mpreal& op)
-{
-    // Please note, this is classic signum function which ignores sign of zero.
-    // Use signbit if you need sign of zero.
-    return mpfr_sgn(op.mpfr_srcptr());
-}
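
As the comment in sgn() notes, the classic signum cannot distinguish the two IEEE zeros: mpfr_sgn returns 0 for both +0 and -0, while signbit still reports the sign bit. A minimal sketch:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        mpreal nz(0);
        nz.setZero(-1);                   // negative zero
        assert(mpfr::sgn(nz) == 0);       // signum ignores the sign of zero
        assert(mpfr::signbit(nz));        // signbit does not
    }
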
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void swap (mpreal& a, mpreal& b)                     {    mpfr_swap(a.mpfr_ptr(),b.mpfr_ptr());    }
-inline const mpreal (max)(const mpreal& x, const mpreal& y) {    return (x>y?x:y);    }
-inline const mpreal (min)(const mpreal& x, const mpreal& y) {    return (x<y?x:y);    }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
-    return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
-    mpreal x;
-    mpfr_random2(x.mpfr_ptr(),size,exp);
-    return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- initialization & first random number generation
-// a = random();     <- next random numbers generation
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0) gmp_randseed_ui(state,seed);
-
-    return mpfr::urandom(state);
-#else
-    if(seed != 0) std::srand(seed);
-    return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0) && MPFR_VERSION < MPFR_VERSION_NUM(4,0,0))
-
-// TODO:
-// Use mpfr_nrandom since mpfr_grandom is deprecated
-#if defined(_MSC_VER)
-#pragma warning( push )
-#pragma warning( disable : 1478)
-#endif
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
-    return x;
-}
-#if defined(_MSC_VER)
-#pragma warning( pop )
-#endif
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0) gmp_randseed_ui(state,seed);
-
-    return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
-    mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
-    mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
-    int i;
-    double t;
-    return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
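
The long ladder of pow overloads that follows exists to route each integer/floating combination to the cheapest MPFR kernel (mpfr_ui_pow_ui, mpfr_ui_pow, mpfr_pow_ui, mpfr_pow_si), falling back to the general mpfr_pow only when an operand is negative or fractional, as the trailing comments indicate. Usage is uniform regardless of which kernel ends up being selected; a hedged sketch:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        assert(mpfr::pow(mpreal(2), 10) == 1024);  // signed-exponent fast path
        assert(mpfr::pow(2ul, 10ul)     == 1024);  // mpfr_ui_pow_ui fast path
    }
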
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else        return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else        return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    mpreal x(a);
-    mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use efficient version of swap (due to Koenig lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
-    // we are allowed to extend namespace std with specializations only
-    template <>
-    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
-    {
-        return mpfr::swap(x, y);
-    }
-
-    template<>
-    class numeric_limits<mpfr::mpreal>
-    {
-    public:
-        static const bool is_specialized    = true;
-        static const bool is_signed         = true;
-        static const bool is_integer        = false;
-        static const bool is_exact          = false;
-        static const int  radix             = 2;
-
-        static const bool has_infinity      = true;
-        static const bool has_quiet_NaN     = true;
-        static const bool has_signaling_NaN = true;
-
-        static const bool is_iec559         = true;        // = IEEE 754
-        static const bool is_bounded        = true;
-        static const bool is_modulo         = false;
-        static const bool traps             = true;
-        static const bool tinyness_before   = true;
-
-        static const float_denorm_style has_denorm  = denorm_absent;
-
-        inline static mpfr::mpreal (min)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::minval(precision);  }
-        inline static mpfr::mpreal (max)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::maxval(precision);  }
-        inline static mpfr::mpreal lowest   (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return -mpfr::maxval(precision);  }
-
-        // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
-        inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::machine_epsilon(precision);  }
-
-        // Returns smallest eps such that x + eps != x (relative machine epsilon)
-        inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) {  return mpfr::machine_epsilon(x);  }
-
-        inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision);
-            else               return mpfr::mpreal(1.0, precision);
-        }
-
-        inline static const mpfr::mpreal infinity()         { return mpfr::const_infinity();     }
-        inline static const mpfr::mpreal quiet_NaN()        { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal signaling_NaN()    { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal denorm_min()       { return (min)();                    }
-
-        // Please note, exponent range is not fixed in MPFR
-        static const int min_exponent = MPFR_EMIN_DEFAULT;
-        static const int max_exponent = MPFR_EMAX_DEFAULT;
-        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
-        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
-        // Following members should be constant according to standard, but they can be variable in MPFR
-        // So we define them as functions here.
-        //
-        // This is preferable way for std::numeric_limits specialization.
-        // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost.
-        // See below for compatible implementation.
-        inline static float_round_style round_style()
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            switch (r)
-            {
-            case GMP_RNDN: return round_to_nearest;
-            case GMP_RNDZ: return round_toward_zero;
-            case GMP_RNDU: return round_toward_infinity;
-            case GMP_RNDD: return round_toward_neg_infinity;
-            default: return round_indeterminate;
-            }
-        }
-
-        inline static int digits()                        {  return int(mpfr::mpreal::get_default_prec());  }
-        inline static int digits(const mpfr::mpreal& x)   {  return x.getPrecision();  }
-
-        inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return mpfr::bits2digits(precision);
-        }
-
-        inline static int digits10(const mpfr::mpreal& x)
-        {
-            return mpfr::bits2digits(x.getPrecision());
-        }
-
-        inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return digits10(precision);
-        }
-#else
-        // Digits and round_style are NOT constants when it comes to mpreal.
-        // If possible, please use functions digits() and round_style() defined above.
-        //
-        // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
-        // Change them accordingly to your application.
-        //
-        // For example, if you use 256 bits of precision uniformly in your program, then:
-        // digits       = 256
-        // digits10     = 77
-        // max_digits10 = 78
-        //
-        // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
-        static const std::float_round_style round_style = round_to_nearest;
-        static const int digits       = 53;
-        static const int digits10     = 15;
-        static const int max_digits10 = 16;
-#endif
-    };
-
-}
-
-#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 4a25e993c0860fce08750a7489cc3f1de1a5e0a8..10beb0714a4d826ffc66e74f2d1701053bd7b21c 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -1,3 +1,4 @@
+#include <mpreal.h>  // Must be included before main.h.
#include "main.h" #include #include diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index cdfd10ca45901f0d3ef32b7c3b9c06c322068f2c..602c2cb84de7e8d1df92586ee1c459ce727a30ee 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp @@ -31,6 +31,22 @@ static long g_dense_op_sparse_count = 0; #include "sparse_basic.cpp" #endif +#if EIGEN_HAS_CXX11 + +#ifdef min +#undef min +#endif + +#ifdef max +#undef max +#endif + +#include +#define EIGEN_UNORDERED_MAP_SUPPORT + +#endif + + #include template @@ -146,6 +162,7 @@ template void sparse_extra(const SparseMatrixType& re } + template void check_marketio() { diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 589bb76e10f835f6333ded87b5ed6de0d7b405a0..44c77535e61471ffc72ce0fec4c58514746a6b41 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -171,9 +171,9 @@ template void array_special_functions() // Check the ndtri function against scipy.special.ndtri { - ArrayType x(7), res(7), ref(7); - x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01; - ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408; + ArrayType x(11), res(11), ref(11); + x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01, 0, 1, -0.01, 1.01; + ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408, -plusinf, plusinf, nan, nan; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); ); CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); ); @@ -191,10 +191,10 @@ template void array_special_functions() // Check the zeta function against scipy.special.zeta { - ArrayType x(10), q(10), res(10), ref(10); - x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4; - q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3; - ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf; + ArrayType x(11), q(11), res(11), ref(11); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4, 2000; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3, 2000; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf, 0; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );