diff --git a/.gitignore b/.gitignore index f6ab76fdadf4d5f19c6b124909175d3f509b4c33..19dfac9ef6a76a91edd2844e09751a15524e1691 100644 --- a/.gitignore +++ b/.gitignore @@ -36,3 +36,6 @@ lapack/reference .settings Makefile !ci/build.gitlab-ci.yml +!scripts/buildtests.in +!Eigen/Core +!Eigen/src/Core diff --git a/CMakeLists.txt b/CMakeLists.txt index 7ba7d38721875ff3ce7001e22babd2028f414cf0..ac8042b18900370e5d563a1f9a9716049ec89e9f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,16 @@ cmake_minimum_required(VERSION 3.5.0) project(Eigen3) +# Remove this block after bumping CMake to v3.21.0 +# PROJECT_IS_TOP_LEVEL is defined then by default +if(CMAKE_VERSION VERSION_LESS 3.21.0) + if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + set(PROJECT_IS_TOP_LEVEL TRUE) + else() + set(PROJECT_IS_TOP_LEVEL FALSE) + endif() +endif() + # guard against in-source builds if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) @@ -23,7 +33,7 @@ endif() ############################################################################# -# retrieve version information # +# retrieve version information # ############################################################################# # automatically parse the version number @@ -88,6 +98,23 @@ else() ei_add_cxx_compiler_flag("-std=c++03") endif() +function(ei_maybe_separate_arguments variable mode args) + # Use separate_arguments if the input is a single string containing a space. + # Otherwise, if it is already a list or doesn't have a space, just propagate + # the original value. This is to better support multi-argument lists. + list(LENGTH args list_length) + if (${list_length} EQUAL 1) + string(FIND "${args}" " " has_space) + if (${has_space} GREATER -1) + separate_arguments(args ${mode} "${args}") + endif() + endif() + set(${variable} ${args} PARENT_SCOPE) +endfunction(ei_maybe_separate_arguments) + +# Determine if we should build shared libraries on this platform. +get_cmake_property(EIGEN_BUILD_SHARED_LIBS TARGET_SUPPORTS_SHARED_LIBS) + ############################################################################# # find how to link to the standard libraries # ############################################################################# @@ -98,6 +125,10 @@ find_package(StandardMathLibrary) set(EIGEN_TEST_CUSTOM_LINKER_FLAGS "" CACHE STRING "Additional linker flags when linking unit tests.") set(EIGEN_TEST_CUSTOM_CXX_FLAGS "" CACHE STRING "Additional compiler flags when compiling unit tests.") +# Convert space-separated arguments into CMake lists for downstream consumption. +ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_LINKER_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_LINKER_FLAGS}") +ei_maybe_separate_arguments(EIGEN_TEST_CUSTOM_CXX_FLAGS NATIVE_COMMAND "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "") if(NOT STANDARD_MATH_LIBRARY_FOUND) @@ -106,13 +137,11 @@ if(NOT STANDARD_MATH_LIBRARY_FOUND) "Can't link to the standard math library. 
Please report to the Eigen developers, telling them about your platform.") else() - if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO} ${STANDARD_MATH_LIBRARY}") else() set(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO "${STANDARD_MATH_LIBRARY}") endif() - endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -122,6 +151,7 @@ else() endif() option(EIGEN_BUILD_BTL "Build benchmark suite" OFF) +option(EIGEN_BUILD_SPBENCH "Build sparse benchmark suite" OFF) # Disable pkgconfig only for native Windows builds if(NOT WIN32 OR NOT CMAKE_HOST_SYSTEM_NAME MATCHES Windows) @@ -250,18 +280,12 @@ if(NOT MSVC) option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) if(EIGEN_TEST_AVX512) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512f -mfma") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() message(STATUS "Enabling AVX512 in tests/examples") endif() option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) if(EIGEN_TEST_AVX512DQ) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq") - if (NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fabi-version=6") - endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx512dq -mfma") message(STATUS "Enabling AVX512DQ in tests/examples") endif() @@ -366,11 +390,19 @@ else() endif() option(EIGEN_TEST_FMA "Enable/Disable FMA/AVX2 in tests/examples" OFF) - if(EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) + option(EIGEN_TEST_AVX2 "Enable/Disable FMA/AVX2 in tests/examples" OFF) + if((EIGEN_TEST_FMA AND NOT EIGEN_TEST_NEON) OR EIGEN_TEST_AVX2) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") message(STATUS "Enabling FMA/AVX2 in tests/examples") endif() + option(EIGEN_TEST_AVX512 "Enable/Disable AVX512 in tests/examples" OFF) + option(EIGEN_TEST_AVX512DQ "Enable/Disable AVX512DQ in tests/examples" OFF) + if(EIGEN_TEST_AVX512 OR EIGEN_TEST_AVX512DQ) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512") + message(STATUS "Enabling AVX512 in tests/examples") + endif() + endif() option(EIGEN_TEST_NO_EXPLICIT_VECTORIZATION "Disable explicit vectorization in tests/examples" OFF) @@ -413,7 +445,8 @@ if(EIGEN_TEST_NO_EXCEPTIONS) message(STATUS "Disabling exceptions in tests/examples") endif() -set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture level to target when compiling CUDA code") +set(EIGEN_CUDA_CXX_FLAGS "" CACHE STRING "Additional flags to pass to the cuda compiler.") +set(EIGEN_CUDA_COMPUTE_ARCH 30 CACHE STRING "The CUDA compute architecture(s) to target when compiling CUDA code") include_directories(${CMAKE_CURRENT_SOURCE_DIR}) @@ -424,25 +457,26 @@ endif() if(EIGEN_INCLUDE_INSTALL_DIR AND NOT INCLUDE_INSTALL_DIR) set(INCLUDE_INSTALL_DIR ${EIGEN_INCLUDE_INSTALL_DIR} - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed") + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed") else() set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_INCLUDEDIR}/eigen3" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen header files are installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where Eigen header files are installed" ) endif() set(CMAKEPACKAGE_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/eigen3/cmake" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where Eigen3Config.cmake is installed" + CACHE PATH "The directory 
relative to CMAKE_INSTALL_PREFIX where Eigen3Config.cmake is installed" ) set(PKGCONFIG_INSTALL_DIR "${CMAKE_INSTALL_DATADIR}/pkgconfig" - CACHE STRING "The directory relative to CMAKE_PREFIX_PATH where eigen3.pc is installed" + CACHE PATH "The directory relative to CMAKE_INSTALL_PREFIX where eigen3.pc is installed" ) foreach(var INCLUDE_INSTALL_DIR CMAKEPACKAGE_INSTALL_DIR PKGCONFIG_INSTALL_DIR) + # If an absolute path is specified, make it relative to "{CMAKE_INSTALL_PREFIX}". if(IS_ABSOLUTE "${${var}}") - message(FATAL_ERROR "${var} must be relative to CMAKE_PREFIX_PATH. Got: ${${var}}") + file(RELATIVE_PATH "${var}" "${CMAKE_INSTALL_PREFIX}" "${${var}}") endif() endforeach() @@ -478,8 +512,9 @@ if(EIGEN_BUILD_DOC) endif() -option(BUILD_TESTING "Enable creation of Eigen tests." ON) -if(BUILD_TESTING) +cmake_dependent_option(BUILD_TESTING "Enable creation of tests." ON "PROJECT_IS_TOP_LEVEL" OFF) +option(EIGEN_BUILD_TESTING "Enable creation of Eigen tests." ${BUILD_TESTING}) +if(EIGEN_BUILD_TESTING) include(EigenConfigureTesting) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) @@ -491,6 +526,9 @@ if(BUILD_TESTING) add_subdirectory(failtest) endif() +include(CMakeDetermineFortranCompiler) +option(EIGEN_BUILD_BLAS "Toggles the building of the Eigen Blas library" ${CMAKE_Fortran_COMPILER}) +option(EIGEN_BUILD_LAPACK "Toggles the building of the included Eigen LAPACK library" ${CMAKE_Fortran_COMPILER}) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(blas) add_subdirectory(lapack) @@ -541,13 +579,13 @@ if(EIGEN_BUILD_BTL) add_subdirectory(bench/btl EXCLUDE_FROM_ALL) endif() -if(NOT WIN32) +if(NOT WIN32 AND EIGEN_BUILD_SPBENCH) add_subdirectory(bench/spbench EXCLUDE_FROM_ALL) endif() configure_file(scripts/cdashtesting.cmake.in cdashtesting.cmake @ONLY) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) ei_testing_print_summary() endif() @@ -555,34 +593,35 @@ message(STATUS "") message(STATUS "Configured Eigen ${EIGEN_VERSION_NUMBER}") message(STATUS "") -string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) -if(cmake_generator_tolower MATCHES "makefile") - message(STATUS "Available targets (use: make TARGET):") -else() - message(STATUS "Available targets (use: cmake --build . --target TARGET):") -endif() -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "Target | Description") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "install | Install Eigen. Headers will be installed to:") -message(STATUS " | /") -message(STATUS " | Using the following values:") -message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") -message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") -message(STATUS " | Change the install location of Eigen headers using:") -message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") -message(STATUS " | Or:") -message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") -message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") -if(BUILD_TESTING) - message(STATUS "check | Build and run the unit-tests. Read this page:") - message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") +if(PROJECT_IS_TOP_LEVEL) + string(TOLOWER "${CMAKE_GENERATOR}" cmake_generator_tolower) + if(cmake_generator_tolower MATCHES "makefile") + message(STATUS "Available targets (use: make TARGET):") + else() + message(STATUS "Available targets (use: cmake --build . 
--target TARGET):") + endif() + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "Target | Description") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "install | Install Eigen. Headers will be installed to:") + message(STATUS " | /") + message(STATUS " | Using the following values:") + message(STATUS " | CMAKE_INSTALL_PREFIX: ${CMAKE_INSTALL_PREFIX}") + message(STATUS " | INCLUDE_INSTALL_DIR: ${INCLUDE_INSTALL_DIR}") + message(STATUS " | Change the install location of Eigen headers using:") + message(STATUS " | cmake . -DCMAKE_INSTALL_PREFIX=yourprefix") + message(STATUS " | Or:") + message(STATUS " | cmake . -DINCLUDE_INSTALL_DIR=yourdir") + message(STATUS "doc | Generate the API documentation, requires Doxygen & LaTeX") + if(EIGEN_BUILD_TESTING) + message(STATUS "check | Build and run the unit-tests. Read this page:") + message(STATUS " | http://eigen.tuxfamily.org/index.php?title=Tests") + endif() + message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") + message(STATUS "uninstall| Remove files installed by the install target") + message(STATUS "---------+--------------------------------------------------------------") + message(STATUS "") endif() -message(STATUS "blas | Build BLAS library (not the same thing as Eigen)") -message(STATUS "uninstall| Remove files installed by the install target") -message(STATUS "---------+--------------------------------------------------------------") -message(STATUS "") - set ( EIGEN_VERSION_STRING ${EIGEN_VERSION_NUMBER} ) set ( EIGEN_VERSION_MAJOR ${EIGEN_WORLD_VERSION} ) @@ -608,6 +647,8 @@ set_target_properties (eigen PROPERTIES EXPORT_NAME Eigen) install (TARGETS eigen EXPORT Eigen3Targets) +option(EIGEN_BUILD_CMAKE_PACKAGE "Enables the creation of EigenConfig.cmake and related files" ON) +if(EIGEN_BUILD_CMAKE_PACKAGE) configure_package_config_file ( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Eigen3Config.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/Eigen3Config.cmake @@ -643,6 +684,7 @@ install ( FILES ${CMAKE_CURRENT_SOURCE_DIR}/cmake/UseEigen3.cmake # Add uninstall target add_custom_target ( uninstall COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmake/EigenUninstall.cmake) +endif() if (EIGEN_SPLIT_TESTSUITE) ei_split_testsuite("${EIGEN_SPLIT_TESTSUITE}") diff --git a/Eigen/CholmodSupport b/Eigen/CholmodSupport index bed8924d31e0b3f46713cf74ba3deb6a63f9f590..1037bd55de367eaae9cc2484d39acc2146f3cadb 100644 --- a/Eigen/CholmodSupport +++ b/Eigen/CholmodSupport @@ -22,7 +22,7 @@ extern "C" { * This module provides an interface to the Cholmod library which is part of the suitesparse package. * It provides the two following main factorization classes: * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization. - * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). + * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial). 
* * For the sake of completeness, this module also propose the two following classes: * - class CholmodSimplicialLLT diff --git a/Eigen/Core b/Eigen/Core index 5921e15f9df46319ceb5436296b3271ca4dc9e65..1e53ba49b5ecfa9bb7d0bdeeb34648a3f6772123 100644 --- a/Eigen/Core +++ b/Eigen/Core @@ -83,8 +83,8 @@ #include #include #include -#include #ifndef EIGEN_NO_IO + #include #include #endif #include @@ -109,7 +109,8 @@ #endif // required for __cpuid, needs to be included after cmath -#if EIGEN_COMP_MSVC && EIGEN_ARCH_i386_OR_x86_64 && !EIGEN_OS_WINCE +// also required for _BitScanReverse on Windows on ARM +#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE #include #endif diff --git a/Eigen/LU b/Eigen/LU index 0fb184bcb84df406c281730421ba3051e03e8c1c..1236ceb04676f5e180b589f8ca70c3d9362710cf 100644 --- a/Eigen/LU +++ b/Eigen/LU @@ -38,9 +38,7 @@ #include "src/LU/Determinant.h" #include "src/LU/InverseImpl.h" -// Use the SSE optimized version whenever possible. At the moment the -// SSE version doesn't compile when AVX is enabled -#if (defined EIGEN_VECTORIZE_SSE && !defined EIGEN_VECTORIZE_AVX) || defined EIGEN_VECTORIZE_NEON +#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON #include "src/LU/arch/InverseSize4.h" #endif diff --git a/Eigen/SparseLU b/Eigen/SparseLU index 37c4a5c5a8b305add93e2aae0eb7eac624f3ce68..047cf0dca94923dd820733d6a549c8db3ee4a1af 100644 --- a/Eigen/SparseLU +++ b/Eigen/SparseLU @@ -25,8 +25,6 @@ #include "src/Core/util/DisableStupidWarnings.h" -#include "src/SparseLU/SparseLU_gemm_kernel.h" - #include "src/SparseLU/SparseLU_Structs.h" #include "src/SparseLU/SparseLU_SupernodalMatrix.h" #include "src/SparseLU/SparseLUImpl.h" diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index ab2ebf37e6c72cfee6b09c109cfb2aacc3ba0f14..7d76f0c256fd6207819921680217b30697e81ca3 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -591,7 +591,7 @@ struct dense_assignment_loop enum { innerSize = DstXprType::InnerSizeAtCompileTime, packetSize =unpacket_traits::size, - vectorizableSize = (innerSize/packetSize)*packetSize, + vectorizableSize = (int(innerSize) / int(packetSize)) * int(packetSize), size = DstXprType::SizeAtCompileTime }; for(Index outer = 0; outer < kernel.outerSize(); ++outer) @@ -785,6 +785,16 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType dense_assignment_loop::run(kernel); } +// Specialization for filling the destination with a constant value. +#ifndef EIGEN_GPU_COMPILE_PHASE +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const Eigen::CwiseNullaryOp, DstXprType>& src, const internal::assign_op& func) +{ + resize_if_allowed(dst, src, func); + std::fill_n(dst.data(), dst.size(), src.functor()()); +} +#endif + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) { diff --git a/Eigen/src/Core/BandMatrix.h b/Eigen/src/Core/BandMatrix.h index 480e0449556319bec2d73773a2c65a82ea4d8aa9..878c0240ac1699abd0b05e2d34cea9fa4929fe72 100644 --- a/Eigen/src/Core/BandMatrix.h +++ b/Eigen/src/Core/BandMatrix.h @@ -67,7 +67,7 @@ class BandMatrixBase : public EigenBase * \warning the internal storage must be column major. 
*/ inline Block col(Index i) { - EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); + EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES); Index start = 0; Index len = coeffs().rows(); if (i<=supers()) @@ -90,7 +90,7 @@ class BandMatrixBase : public EigenBase template struct DiagonalIntReturnType { enum { - ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)), + ReturnOpposite = (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)), Conjugate = ReturnOpposite && NumTraits::IsComplex, ActualIndex = ReturnOpposite ? -Index : Index, DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic) @@ -192,7 +192,7 @@ struct traits > Options = _Options, DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic }; - typedef Matrix CoefficientsType; + typedef Matrix CoefficientsType; }; template diff --git a/Eigen/src/Core/Block.h b/Eigen/src/Core/Block.h index 3206d6633e62c29f4466155f47725511a34ba640..9d89b60cf8f0d4b4740fad1f0bf41f5fd8a1bffd 100644 --- a/Eigen/src/Core/Block.h +++ b/Eigen/src/Core/Block.h @@ -260,19 +260,19 @@ template - inline PacketScalar packet(Index rowId, Index colId) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const { return m_xpr.template packet(rowId + m_startRow.value(), colId + m_startCol.value()); } template - inline void writePacket(Index rowId, Index colId, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val) { m_xpr.template writePacket(rowId + m_startRow.value(), colId + m_startCol.value(), val); } template - inline PacketScalar packet(Index index) const + EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const { return m_xpr.template packet (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -280,7 +280,7 @@ template - inline void writePacket(Index index, const PacketScalar& val) + EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val) { m_xpr.template writePacket (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index), @@ -334,6 +334,17 @@ class BlockImpl_dense enum { XprTypeIsRowMajor = (int(traits::Flags)&RowMajorBit) != 0 }; + + /** \internal Returns base+offset (unless base is null, in which case returns null). + * Adding an offset to nullptr is undefined behavior, so we must avoid it. + */ + template + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE + static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) + { + return base != NULL ? base+offset : NULL; + } + public: typedef MapBase Base; @@ -344,8 +355,9 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index i) - : Base(xpr.data() + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) - || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride()), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + i * ( ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) + || ((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && ( XprTypeIsRowMajor)) ? xpr.innerStride() : xpr.outerStride())), BlockRows==1 ? 1 : xpr.rows(), BlockCols==1 ? 
1 : xpr.cols()), m_xpr(xpr), @@ -359,7 +371,8 @@ class BlockImpl_dense */ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + : Base((BlockRows == 0 || BlockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol))), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); @@ -371,7 +384,9 @@ class BlockImpl_dense BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols) - : Base(xpr.data()+xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol), blockRows, blockCols), + : Base((blockRows == 0 || blockCols == 0) ? NULL : add_to_nullable_pointer(xpr.data(), + xpr.innerStride()*(XprTypeIsRowMajor?startCol:startRow) + xpr.outerStride()*(XprTypeIsRowMajor?startRow:startCol)), + blockRows, blockCols), m_xpr(xpr), m_startRow(startRow), m_startCol(startCol) { init(); diff --git a/Eigen/src/Core/BooleanRedux.h b/Eigen/src/Core/BooleanRedux.h index e32c4ac5b17415ff70fdb15ff1dbbe8cd3a7c760..fa4d7c33114f2112dc199dcd32a9307e57eb842f 100644 --- a/Eigen/src/Core/BooleanRedux.h +++ b/Eigen/src/Core/BooleanRedux.h @@ -14,54 +14,56 @@ namespace Eigen { namespace internal { -template +template struct all_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return all_unroller::run(mat) && mat.coeff(row, col); + return all_unroller::run(mat) && mat.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i); } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &/*mat*/) { return true; } }; -template -struct all_unroller +template +struct all_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; -template +template struct any_unroller { enum { - col = (UnrollCount-1) / Rows, - row = (UnrollCount-1) % Rows + IsRowMajor = (int(Derived::Flags) & int(RowMajor)), + i = (UnrollCount-1) / InnerSize, + j = (UnrollCount-1) % InnerSize }; - + EIGEN_DEVICE_FUNC static inline bool run(const Derived &mat) { - return any_unroller::run(mat) || mat.coeff(row, col); + return any_unroller::run(mat) || mat.coeff(IsRowMajor ? i : j, IsRowMajor ? 
j : i); } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived & /*mat*/) { return false; } }; -template -struct any_unroller +template +struct any_unroller { EIGEN_DEVICE_FUNC static inline bool run(const Derived &) { return false; } }; @@ -81,16 +83,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::all() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::all_unroller::RowsAtCompileTime>::run(evaluator); + return internal::all_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (!evaluator.coeff(i, j)) return false; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (!evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return false; return true; } } @@ -105,16 +107,16 @@ EIGEN_DEVICE_FUNC inline bool DenseBase::any() const typedef internal::evaluator Evaluator; enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * (Evaluator::CoeffReadCost + NumTraits::AddCost) <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * (int(Evaluator::CoeffReadCost) + int(NumTraits::AddCost)) <= EIGEN_UNROLLING_LIMIT }; Evaluator evaluator(derived()); if(unroll) - return internal::any_unroller::RowsAtCompileTime>::run(evaluator); + return internal::any_unroller::run(evaluator); else { - for(Index j = 0; j < cols(); ++j) - for(Index i = 0; i < rows(); ++i) - if (evaluator.coeff(i, j)) return true; + for(Index i = 0; i < derived().outerSize(); ++i) + for(Index j = 0; j < derived().innerSize(); ++j) + if (evaluator.coeff(IsRowMajor ? i : j, IsRowMajor ? j : i)) return true; return false; } } @@ -156,7 +158,7 @@ inline bool DenseBase::allFinite() const return !((derived()-derived()).hasNaN()); #endif } - + } // end namespace Eigen #endif // EIGEN_ALLANDANY_H diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 90c552f13e24bfd338705548d25136dcc494e79e..0ff8c8deb820deb7bbaa2f762afc5e4366f29616 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -561,7 +561,7 @@ struct unary_evaluator, IndexBased > typedef CwiseUnaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = evaluator::Flags & (HereditaryBits | LinearAccessBit | (functor_traits::PacketAccess ? 
PacketAccessBit : 0)), @@ -606,13 +606,13 @@ struct unary_evaluator, IndexBased > protected: // this helper permits to completely eliminate the functor if it is empty - class Data : private UnaryOp + struct Data { - public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } + const UnaryOp& func() const { return op; } + UnaryOp op; evaluator argImpl; }; @@ -639,7 +639,7 @@ struct ternary_evaluator, IndexBased typedef CwiseTernaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Arg1Flags = evaluator::Flags, Arg2Flags = evaluator::Flags, @@ -700,12 +700,13 @@ struct ternary_evaluator, IndexBased protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private TernaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : TernaryOp(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} + Data(const XprType& xpr) : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TernaryOp& func() const { return static_cast(*this); } + const TernaryOp& func() const { return op; } + TernaryOp op; evaluator arg1Impl; evaluator arg2Impl; evaluator arg3Impl; @@ -735,7 +736,7 @@ struct binary_evaluator, IndexBased, IndexBase typedef CwiseBinaryOp XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), LhsFlags = evaluator::Flags, RhsFlags = evaluator::Flags, @@ -793,12 +794,13 @@ struct binary_evaluator, IndexBased, IndexBase protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private BinaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : BinaryOp(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} + Data(const XprType& xpr) : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const BinaryOp& func() const { return static_cast(*this); } + const BinaryOp& func() const { return op; } + BinaryOp op; evaluator lhsImpl; evaluator rhsImpl; }; @@ -815,7 +817,7 @@ struct unary_evaluator, IndexBased> typedef CwiseUnaryView XprType; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = (evaluator::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)), @@ -858,12 +860,13 @@ struct unary_evaluator, IndexBased> protected: // this helper permits to completely eliminate the functor if it is empty - struct Data : private UnaryOp + struct Data { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - Data(const XprType& xpr) : UnaryOp(xpr.functor()), argImpl(xpr.nestedExpression()) {} + Data(const XprType& xpr) : op(xpr.functor()), argImpl(xpr.nestedExpression()) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const UnaryOp& func() const { return static_cast(*this); } + const 
UnaryOp& func() const { return op; } + UnaryOp op; evaluator argImpl; }; diff --git a/Eigen/src/Core/CwiseBinaryOp.h b/Eigen/src/Core/CwiseBinaryOp.h index 59974a545177dd4e74fc351724bbb6eddce06ce3..2202b1cc6b78c19de49e23ab81fd6e0982372996 100644 --- a/Eigen/src/Core/CwiseBinaryOp.h +++ b/Eigen/src/Core/CwiseBinaryOp.h @@ -102,7 +102,7 @@ class CwiseBinaryOp : #if EIGEN_COMP_MSVC && EIGEN_HAS_CXX11 //Required for Visual Studio or the Copy constructor will probably not get inlined! - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp&) = default; #endif diff --git a/Eigen/src/Core/DenseBase.h b/Eigen/src/Core/DenseBase.h index 9b16db68d48d2bfbf7e5c076ef2ba1504074f3f3..cdd0f5f168ea51f446128d57b444eae5d5db2c70 100644 --- a/Eigen/src/Core/DenseBase.h +++ b/Eigen/src/Core/DenseBase.h @@ -324,9 +324,9 @@ template class DenseBase typedef Transpose TransposeReturnType; EIGEN_DEVICE_FUNC TransposeReturnType transpose(); - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; EIGEN_DEVICE_FUNC - ConstTransposeReturnType transpose() const; + const ConstTransposeReturnType transpose() const; EIGEN_DEVICE_FUNC void transposeInPlace(); diff --git a/Eigen/src/Core/DenseStorage.h b/Eigen/src/Core/DenseStorage.h index f6e1d0af1a9254e0e442c7846be419eb5b24cefd..08ef6c530617401094344790e64a7ac21addb2ad 100644 --- a/Eigen/src/Core/DenseStorage.h +++ b/Eigen/src/Core/DenseStorage.h @@ -163,6 +163,30 @@ struct plain_array EIGEN_DEVICE_FUNC plain_array(constructor_without_unaligned_array_assert) {} }; +struct plain_array_helper { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void copy(const plain_array& src, const Eigen::Index size, + plain_array& dst) { + smart_copy(src.array, src.array + size, dst.array); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + static void swap(plain_array& a, const Eigen::Index a_size, + plain_array& b, const Eigen::Index b_size) { + if (a_size < b_size) { + std::swap_ranges(b.array, b.array + a_size, a.array); + smart_move(b.array + a_size, b.array + b_size, a.array + a_size); + } else if (a_size > b_size) { + std::swap_ranges(a.array, a.array + b_size, b.array); + smart_move(a.array + b_size, a.array + a_size, b.array + b_size); + } else { + std::swap_ranges(a.array, a.array + a_size, b.array); + } + } +}; + } // end namespace internal /** \internal @@ -190,17 +214,26 @@ template class DenseSt EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} +#if !EIGEN_HAS_CXX11 || defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) : m_data(other.m_data) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) } +#else + EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage&) = default; +#endif +#if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { if (this != &other) m_data = other.m_data; return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage&) = default; +#endif #if EIGEN_HAS_RVALUE_REFERENCES +#if !EIGEN_HAS_CXX11 EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&& other) EIGEN_NOEXCEPT : m_data(std::move(other.m_data)) { @@ -211,6 +244,10 @@ template class DenseSt m_data = std::move(other.m_data); return *this; } +#else + EIGEN_DEVICE_FUNC DenseStorage(DenseStorage&&) = default; + EIGEN_DEVICE_FUNC DenseStorage& operator=(DenseStorage&&) = 
default; +#endif #endif EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) @@ -268,21 +305,25 @@ template class DenseStorage class DenseStorage class DenseStorage::diagonal() /** This is the const version of diagonal(). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalReturnType +EIGEN_DEVICE_FUNC inline +const typename MatrixBase::ConstDiagonalReturnType MatrixBase::diagonal() const { return ConstDiagonalReturnType(derived()); @@ -209,18 +210,18 @@ MatrixBase::diagonal() const * * \sa MatrixBase::diagonal(), class Diagonal */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::DiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline Diagonal MatrixBase::diagonal(Index index) { - return DiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** This is the const version of diagonal(Index). */ template -EIGEN_DEVICE_FUNC inline typename MatrixBase::ConstDiagonalDynamicIndexReturnType +EIGEN_DEVICE_FUNC inline const Diagonal MatrixBase::diagonal(Index index) const { - return ConstDiagonalDynamicIndexReturnType(derived(), index); + return Diagonal(derived(), index); } /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this @@ -237,20 +238,20 @@ MatrixBase::diagonal(Index index) const template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template DiagonalIndexReturnType::Type +inline Diagonal MatrixBase::diagonal() { - return typename DiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } /** This is the const version of diagonal(). */ template template EIGEN_DEVICE_FUNC -inline typename MatrixBase::template ConstDiagonalIndexReturnType::Type +inline const Diagonal MatrixBase::diagonal() const { - return typename ConstDiagonalIndexReturnType::Type(derived()); + return Diagonal(derived()); } } // end namespace Eigen diff --git a/Eigen/src/Core/Dot.h b/Eigen/src/Core/Dot.h index 41a8cb437bc3f6ffdff988b49cbd0cff699d3d8d..abac7ad48ed85befd608a2522fdbdb187187dc6f 100644 --- a/Eigen/src/Core/Dot.h +++ b/Eigen/src/Core/Dot.h @@ -18,14 +18,9 @@ namespace internal { // with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE // looking at the static assertions. Thus this is a trick to get better compile errors. template + bool NeedToTranspose = T::IsVectorAtCompileTime && U::IsVectorAtCompileTime && + ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1) || + (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))> struct dot_nocheck { typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; @@ -86,7 +81,7 @@ MatrixBase::dot(const MatrixBase& other) const //---------- implementation of L2 norm and related functions ---------- -/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm. +/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm. * In both cases, it consists in the sum of the square of all the matrix entries. * For vectors, this is also equals to the dot product of \c *this with itself. 
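
The three equivalent quantities this doc fix describes can be checked directly. A minimal sketch; all calls are standard Eigen API, and squared_norm_demo is an illustrative name:

    #include <Eigen/Dense>
    #include <cassert>

    // For a vector v: squaredNorm() == v.dot(v) == sum of the squared entries.
    void squared_norm_demo() {
      Eigen::Vector3d v(1.0, 2.0, 3.0);
      double a = v.squaredNorm();           // 14
      double b = v.dot(v);                  // 14
      double c = v.array().square().sum();  // 14
      assert(a == b && b == c);             // exact for these small integers
    }
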
* diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 53800a005771016cac01660ede7d818e4cbe1608..cf677a1905f1e80697eb6c0daccb48f745c605e3 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -129,6 +129,22 @@ template struct packet_traits : default_packet_traits template struct packet_traits : packet_traits { }; +template struct unpacket_traits +{ + typedef T type; + typedef T half; + enum + { + size = 1, + alignment = 1, + vectorizable = false, + masked_load_available=false, + masked_store_available=false + }; +}; + +template struct unpacket_traits : unpacket_traits { }; + template struct type_casting_traits { enum { VectorizedCast = 0, @@ -154,6 +170,18 @@ struct eigen_packet_wrapper T m_val; }; + +/** \internal A convenience utility for determining if the type is a scalar. + * This is used to enable some generic packet implementations. + */ +template +struct is_scalar { + typedef typename unpacket_traits::type Scalar; + enum { + value = internal::is_same::value + }; +}; + /** \internal \returns static_cast(a) (coeff-wise) */ template EIGEN_DEVICE_FUNC inline TgtPacket @@ -215,13 +243,59 @@ pmul(const bool& a, const bool& b) { return a && b; } template EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) { return a/b; } -/** \internal \returns one bits */ +// In the generic case, memset to all one bits. +template +struct ptrue_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/){ + Packet b; + memset(static_cast(&b), 0xff, sizeof(Packet)); + return b; + } +}; + +// For non-trivial scalars, set to Scalar(1) (i.e. a non-zero value). +// Although this is technically not a valid bitmask, the scalar path for pselect +// uses a comparison to zero, so this should still work in most cases. We don't +// have another option, since the scalar type requires initialization. +template +struct ptrue_impl::value && NumTraits::RequireInitialization>::type > { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/){ + return T(1); + } +}; + +/** \internal \returns one bits. */ template EIGEN_DEVICE_FUNC inline Packet -ptrue(const Packet& /*a*/) { Packet b; memset((void*)&b, 0xff, sizeof(b)); return b;} +ptrue(const Packet& a) { + return ptrue_impl::run(a); +} -/** \internal \returns zero bits */ +// In the general case, memset to zero. +template +struct pzero_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) { + Packet b; + memset(static_cast(&b), 0x00, sizeof(Packet)); + return b; + } +}; + +// For scalars, explicitly set to Scalar(0), since the underlying representation +// for zero may not consist of all-zero bits. +template +struct pzero_impl::value>::type> { + static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { + return T(0); + } +}; + +/** \internal \returns packet of zeros */ template EIGEN_DEVICE_FUNC inline Packet -pzero(const Packet& /*a*/) { Packet b; memset((void*)&b, 0, sizeof(b)); return b;} +pzero(const Packet& a) { + return pzero_impl::run(a); +} /** \internal \returns a <= b as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet @@ -238,33 +312,6 @@ pcmp_eq(const Packet& a, const Packet& b) { return a==b ? ptrue(a) : pzero(a); } /** \internal \returns a < b or a==NaN or b==NaN as a bit mask */ template EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) { return a>=b ? 
pzero(a) : ptrue(a); } -template<> EIGEN_DEVICE_FUNC inline float pzero(const float& a) { - EIGEN_UNUSED_VARIABLE(a) - return 0.f; -} - -template<> EIGEN_DEVICE_FUNC inline double pzero(const double& a) { - EIGEN_UNUSED_VARIABLE(a) - return 0.; -} - -template -EIGEN_DEVICE_FUNC inline std::complex ptrue(const std::complex& /*a*/) { - RealScalar b = ptrue(RealScalar(0)); - return std::complex(b, b); -} - -template -EIGEN_DEVICE_FUNC inline Packet bitwise_helper(const Packet& a, const Packet& b, Op op) { - const unsigned char* a_ptr = reinterpret_cast(&a); - const unsigned char* b_ptr = reinterpret_cast(&b); - Packet c; - unsigned char* c_ptr = reinterpret_cast(&c); - for (size_t i = 0; i < sizeof(Packet); ++i) { - *c_ptr++ = op(*a_ptr++, *b_ptr++); - } - return c; -} template struct bit_and { @@ -287,42 +334,123 @@ struct bit_xor { } }; +template +struct bit_not { + EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR EIGEN_ALWAYS_INLINE T operator()(const T& a) const { + return ~a; + } +}; + +// Use operators &, |, ^, ~. +template +struct operator_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor()(a, b); } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not()(a); } +}; + +// Apply binary operations byte-by-byte +template +struct bytewise_bitwise_helper { + EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { + return binary(a, b, bit_and()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { + return binary(a, b, bit_or()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { + return binary(a, b, bit_xor()); + } + EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { + return unary(a,bit_not()); + } + + private: + template + EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++); + } + return c; + } + + template + EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) { + const unsigned char* a_ptr = reinterpret_cast(&a); + const unsigned char* b_ptr = reinterpret_cast(&b); + T c; + unsigned char* c_ptr = reinterpret_cast(&c); + for (size_t i = 0; i < sizeof(T); ++i) { + *c_ptr++ = op(*a_ptr++, *b_ptr++); + } + return c; + } +}; + +// In the general case, use byte-by-byte manipulation. +template +struct bitwise_helper : public bytewise_bitwise_helper {}; + +// For integers or non-trivial scalars, use binary operators. 
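
A standalone sketch of the byte-by-byte technique that bytewise_bitwise_helper above uses: it works for any trivially copyable scalar because reading an object's bytes through unsigned char* is well-defined aliasing. Here bytewise_binary is an illustrative name of mine, not Eigen's. The operator-based specialization that the comment above introduces continues immediately below this note.

    #include <cstddef>

    // Apply a binary op to two values byte by byte, as bytewise_bitwise_helper does.
    template <typename T, typename Op>
    T bytewise_binary(const T& a, const T& b, Op op) {
      const unsigned char* pa = reinterpret_cast<const unsigned char*>(&a);
      const unsigned char* pb = reinterpret_cast<const unsigned char*>(&b);
      T c;
      unsigned char* pc = reinterpret_cast<unsigned char*>(&c);
      for (std::size_t i = 0; i < sizeof(T); ++i) pc[i] = op(pa[i], pb[i]);
      return c;
    }
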
+template +struct bitwise_helper::value && (NumTraits::IsInteger || NumTraits::RequireInitialization)>::type + > : public operator_bitwise_helper {}; + /** \internal \returns the bitwise and of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pand(const Packet& a, const Packet& b) { - return bitwise_helper(a, b, bit_and()); + return bitwise_helper::bitwise_and(a, b); } /** \internal \returns the bitwise or of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet por(const Packet& a, const Packet& b) { - return bitwise_helper(a ,b, bit_or()); + return bitwise_helper::bitwise_or(a, b); } /** \internal \returns the bitwise xor of \a a and \a b */ template EIGEN_DEVICE_FUNC inline Packet pxor(const Packet& a, const Packet& b) { - return bitwise_helper(a ,b, bit_xor()); + return bitwise_helper::bitwise_xor(a, b); +} + +/** \internal \returns the bitwise not of \a a */ +template EIGEN_DEVICE_FUNC inline Packet +pnot(const Packet& a) { + return bitwise_helper::bitwise_not(a); } /** \internal \returns the bitwise and of \a a and not \a b */ template EIGEN_DEVICE_FUNC inline Packet -pandnot(const Packet& a, const Packet& b) { return pand(a, pxor(ptrue(b), b)); } +pandnot(const Packet& a, const Packet& b) { return pand(a, pnot(b)); } + +// In the general case, use bitwise select. +template +struct pselect_impl { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return por(pand(a,mask),pandnot(b,mask)); + } +}; + +// For scalars, use ternary select. +template +struct pselect_impl::value>::type > { + static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) { + return numext::equal_strict(mask, Packet(0)) ? b : a; + } +}; /** \internal \returns \a or \b for each field in packet according to \mask */ template EIGEN_DEVICE_FUNC inline Packet pselect(const Packet& mask, const Packet& a, const Packet& b) { - return por(pand(a,mask),pandnot(b,mask)); -} - -template<> EIGEN_DEVICE_FUNC inline float pselect( - const float& cond, const float& a, const float&b) { - return numext::equal_strict(cond,0.f) ? b : a; -} - -template<> EIGEN_DEVICE_FUNC inline double pselect( - const double& cond, const double& a, const double& b) { - return numext::equal_strict(cond,0.) ? 
b : a; + return pselect_impl::run(mask, a, b); } template<> EIGEN_DEVICE_FUNC inline bool pselect( diff --git a/Eigen/src/Core/IndexedView.h b/Eigen/src/Core/IndexedView.h index 08476251d32234dfdd2e9b0cba788c58484bc458..05c2bc9cc632c9fb0fdcbddda604dca586c77808 100644 --- a/Eigen/src/Core/IndexedView.h +++ b/Eigen/src/Core/IndexedView.h @@ -122,10 +122,10 @@ public: {} /** \returns number of rows */ - Index rows() const { return internal::size(m_rowIndices); } + Index rows() const { return internal::index_list_size(m_rowIndices); } /** \returns number of columns */ - Index cols() const { return internal::size(m_colIndices); } + Index cols() const { return internal::index_list_size(m_colIndices); } /** \returns the nested expression */ const typename internal::remove_all::type& @@ -189,12 +189,16 @@ struct unary_evaluator, IndexBased> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -204,6 +208,8 @@ struct unary_evaluator, IndexBased> EIGEN_STATIC_ASSERT_LVALUE(XprType) Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -212,6 +218,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeffRef( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } @@ -220,6 +228,8 @@ struct unary_evaluator, IndexBased> { Index row = XprType::RowsAtCompileTime == 1 ? 0 : index; Index col = XprType::RowsAtCompileTime == 1 ? index : 0; + eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() + && m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols()); return m_argImpl.coeff( m_xpr.rowIndices()[row], m_xpr.colIndices()[col]); } diff --git a/Eigen/src/Core/Map.h b/Eigen/src/Core/Map.h index 93d2ae9071d599188ad031930d719222c8d36e18..218cc157f386924cae44ca6741d6aaba9fc96f74 100644 --- a/Eigen/src/Core/Map.h +++ b/Eigen/src/Core/Map.h @@ -47,7 +47,7 @@ private: * \brief A matrix or vector expression mapping an existing array of data. * * \tparam PlainObjectType the equivalent matrix type of the mapped data - * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. 
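
A minimal example of the MapOptions alignment parameter this doc fix describes; the corrected '+' line of the fix follows right below. The alignas buffer and sum_aligned are illustrative, not part of the patch:

    #include <Eigen/Dense>

    // Map a 16-byte-aligned C array as a Vector4f, telling Eigen about the alignment.
    float sum_aligned() {
      alignas(16) float buf[4] = {1.f, 2.f, 3.f, 4.f};
      Eigen::Map<const Eigen::Vector4f, Eigen::Aligned16> v(buf);
      return v.sum();  // reads buf directly, no copy
    }
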
+ * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32, \c #Aligned16, \c #Aligned8 or \c #Unaligned. * The default is \c #Unaligned. * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout * of an ordinary, contiguous array. This can be overridden by specifying strides. diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 29201214f0ed5ab1c5238e8761e9bb80bf21e66b..764c41c974f3f92efb971f216636c02500145664 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -2,6 +2,7 @@ // for linear algebra. // // Copyright (C) 2006-2010 Benoit Jacob +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -260,19 +261,8 @@ struct conj_default_impl } }; -template struct conj_impl : conj_default_impl {}; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct conj_impl > -{ - EIGEN_DEVICE_FUNC - static inline std::complex run(const std::complex& x) - { - return std::complex(x.real(), -x.imag()); - } -}; -#endif +template::IsComplex> +struct conj_impl : conj_default_impl {}; template struct conj_retval @@ -582,7 +572,9 @@ struct rint_retval * Implementation of arg * ****************************************************************************/ -#if EIGEN_HAS_CXX11_MATH +// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs. +// This seems to be fixed in VS 2019. +#if EIGEN_HAS_CXX11_MATH && (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920) // std::arg is only defined for types of std::complex, or integer types or float/double/long double template::IsComplex || is_integral::value @@ -592,16 +584,13 @@ struct arg_default_impl; template struct arg_default_impl { + typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) + static inline RealScalar run(const Scalar& x) { - #if defined(EIGEN_HIP_DEVICE_COMPILE) - // HIP does not seem to have a native device side implementation for the math routine "arg" + // There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg. using std::arg; - #else - EIGEN_USING_STD(arg); - #endif - return static_cast(arg(x)); + return static_cast(arg(x)); } }; @@ -612,7 +601,7 @@ struct arg_default_impl { EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); + return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); } }; #else @@ -623,7 +612,7 @@ struct arg_default_impl EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? Scalar(EIGEN_PI) : Scalar(0); + return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0); } }; @@ -697,6 +686,30 @@ struct expm1_retval typedef Scalar type; }; +/**************************************************************************** +* Implementation of log * +****************************************************************************/ + +// Complex log defined in MathFunctionsImpl.h. 
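
The identity implemented in MathFunctionsImpl.h is log(z) = log|z| + i*atan2(Im z, Re z). A quick standalone check against the standard library (complex_log_check is an illustrative name); the forward declaration announced by the comment above follows right below this note.

    #include <complex>
    #include <cmath>
    #include <cassert>

    // log(z) = log(|z|) + i*atan2(imag, real); compare against std::log.
    void complex_log_check() {
      std::complex<double> z(3.0, 4.0);
      std::complex<double> mine(std::log(std::abs(z)),
                                std::atan2(z.imag(), z.real()));
      assert(std::abs(mine - std::log(z)) < 1e-12);
    }
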
+template EIGEN_DEVICE_FUNC std::complex complex_log(const std::complex& z); + +template +struct log_impl { + EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) + { + EIGEN_USING_STD(log); + return static_cast(log(x)); + } +}; + +template +struct log_impl > { + EIGEN_DEVICE_FUNC static inline std::complex run(const std::complex& z) + { + return complex_log(z); + } +}; + /**************************************************************************** * Implementation of log1p * ****************************************************************************/ @@ -710,7 +723,7 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD(log); Scalar x1p = RealScalar(1) + x; - Scalar log_1p = log(x1p); + Scalar log_1p = log_impl::run(x1p); const bool is_small = numext::equal_strict(x1p, Scalar(1)); const bool is_inf = numext::equal_strict(x1p, log_1p); return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1))); @@ -864,13 +877,159 @@ struct meta_floor_log2 // no value, error at compile time }; -template -struct random_default_impl -{ - static inline Scalar run(const Scalar& x, const Scalar& y) - { - if (y <= x) - return x; +template +struct count_bits_impl { + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits >> shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT( + is_integral::value && !NumTraits::IsSigned, + THIS_TYPE_IS_NOT_SUPPORTED); + int n = CHAR_BIT * sizeof(BitsType); + int shift = n / 2; + while (bits > 0 && shift > 0) { + BitsType y = bits << shift; + if (y > 0) { + n -= shift; + bits = y; + } + shift /= 2; + } + if (shift == 0) { + --n; + } + return n; + } +}; + +// Count leading zeros. +template +EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + return count_bits_impl::clz(bits); +} + +// Count trailing zeros. +template +EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + return count_bits_impl::ctz(bits); +} + +#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clz(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctz(static_cast(bits)); + } +}; + +template +struct count_bits_impl< + BitsType, typename enable_if::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? 
kNumBits : __builtin_clzl(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctzl(static_cast(bits)); + } +}; + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + static const int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT; + return bits == 0 ? kNumBits : __builtin_clzll(static_cast(bits)) - kLeadingBitsOffset; + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + return bits == 0 ? kNumBits : __builtin_ctzll(static_cast(bits)); + } +}; + +#elif EIGEN_COMP_MSVC + +template +struct count_bits_impl::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanReverse(&out, static_cast(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanForward(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#ifdef _WIN64 + +template +struct count_bits_impl< + BitsType, typename enable_if::type> { + static const int kNumBits = static_cast(sizeof(BitsType) * CHAR_BIT); + static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanReverse64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast(out); + } + + static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) { + EIGEN_STATIC_ASSERT(is_integral::value, THIS_TYPE_IS_NOT_SUPPORTED); + unsigned long out; + _BitScanForward64(&out, static_cast(bits)); + return bits == 0 ? kNumBits : static_cast(out); + } +}; + +#endif // _WIN64 + +#endif // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG + +template +struct random_default_impl { + static inline Scalar run(const Scalar& x, const Scalar& y) { + if (y <= x) return x; // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself. typedef typename make_unsigned::type ScalarU; // ScalarX is the widest of ScalarU and unsigned int. 
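
Between these hunks, a standalone version of the binary-search clz fallback that count_bits_impl above uses when no compiler builtin applies, specialized to uint32_t to make the halving visible. clz32 is an illustrative name; the generic fallback follows the same logic:

    #include <cstdint>
    #include <cassert>

    // Count leading zeros by halving the search window, as the generic fallback does.
    int clz32(uint32_t bits) {
      int n = 32, shift = 16;
      while (bits > 0 && shift > 0) {
        uint32_t y = bits >> shift;           // is there a set bit in the upper half?
        if (y > 0) { n -= shift; bits = y; }  // yes: discard the lower half
        shift /= 2;
      }
      if (shift == 0) --n;  // the loop narrowed down to a single set bit
      return n;             // clz32(0) == 32 by convention
    }

    void clz_demo() {
      assert(clz32(1u) == 31 && clz32(0x80000000u) == 0 && clz32(0u) == 32);
    }
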
@@ -1015,11 +1174,15 @@ template<typename T> EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x)
 }
 
 //MSVC defines a _isnan builtin function, but for double only
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { return _isnan(x)!=0; }
+#endif
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x)      { return _isnan(x)!=0; }
 EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x)       { return _isnan(x)!=0; }
 
+#ifndef EIGEN_GPU_COMPILE_PHASE
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { return isinf_msvc_helper(x); }
+#endif
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x)      { return isinf_msvc_helper(x); }
 EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x)       { return isinf_msvc_helper(x); }
 
@@ -1033,12 +1196,16 @@ EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { return isinf_ms
 #define EIGEN_TMP_NOOPT_ATTRIB EIGEN_DEVICE_FUNC inline __attribute__((noinline,optimize("no-finite-math-only")))
 #endif
 
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { return __builtin_isnan(x); }
+#endif
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { return __builtin_isnan(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { return __builtin_isinf(x); }
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { return __builtin_isinf(x); }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { return __builtin_isinf(x); }
+#endif
 
 #undef EIGEN_TMP_NOOPT_ATTRIB
 
@@ -1095,6 +1262,8 @@ EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y)
 {
   return fmin(x, y);
 }
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
@@ -1106,6 +1275,7 @@ EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y)
   return fminl(x, y);
 #endif
 }
+#endif
 
 template<typename T>
 EIGEN_DEVICE_FUNC
@@ -1125,6 +1295,7 @@ EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y)
 {
   return fmax(x, y);
 }
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
@@ -1137,6 +1308,7 @@ EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y)
 #endif
 }
 #endif
+#endif
 
 #if defined(SYCL_DEVICE_ONLY)
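Editorial note: every `EIGEN_GPU_COMPILE_PHASE` guard added in these hunks follows one pattern — `long double` overloads stay visible to the host pass only, because device compilers demote `long double` to `double` or reject it outright. A minimal sketch of the pattern under made-up names (`MY_GPU_COMPILE_PHASE`, `my_isnan` are not Eigen identifiers):

    #include <cmath>
    #include <iostream>

    // Stand-in for EIGEN_GPU_COMPILE_PHASE: defined only while compiling
    // for the device.
    #if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
    #define MY_GPU_COMPILE_PHASE
    #endif

    inline bool my_isnan(float x)  { return std::isnan(x); }
    inline bool my_isnan(double x) { return std::isnan(x); }
    #ifndef MY_GPU_COMPILE_PHASE
    // Host-only overload: never seen during the device compilation phase.
    inline bool my_isnan(long double x) { return std::isnan(x); }
    #endif

    int main() { std::cout << my_isnan(std::nanl("")) << "\n"; }  // prints 1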
@@ -1293,8 +1465,8 @@ EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y)
 {
   return fabs(x - y);
 }
-#if !defined(EIGEN_GPUCC) // HIP and CUDA do not support long double.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
@@ -1470,8 +1642,7 @@ T rsqrt(const T& x)
 
 template<typename T>
 EIGEN_DEVICE_FUNC
 EIGEN_ALWAYS_INLINE T log(const T &x) {
-  EIGEN_USING_STD(log);
-  return static_cast<T>(log(x));
+  return internal::log_impl<T>::run(x);
 }
 
 #if defined(SYCL_DEVICE_ONLY)
@@ -2022,6 +2193,18 @@ struct rsqrt_impl {
   }
 };
 
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template<typename T>
+struct conj_impl<std::complex<T>, true>
+{
+  EIGEN_DEVICE_FUNC
+  static inline std::complex<T> run(const std::complex<T>& x)
+  {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
+
 } // end namespace internal
 
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MathFunctionsImpl.h b/Eigen/src/Core/MathFunctionsImpl.h
index 0d3f317bbe4a312bdd017288c71e2101927c3c76..4eaaaa78449031830cea7446210226afcd2467d0 100644
--- a/Eigen/src/Core/MathFunctionsImpl.h
+++ b/Eigen/src/Core/MathFunctionsImpl.h
@@ -184,6 +184,15 @@ EIGEN_DEVICE_FUNC std::complex<T> complex_rsqrt(const std::complex<T>& z) {
        : std::complex<T>(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz );
 }
 
+template<typename T>
+EIGEN_DEVICE_FUNC std::complex<T> complex_log(const std::complex<T>& z) {
+  // Computes complex log.
+  T a = numext::abs(z);
+  EIGEN_USING_STD(atan2);
+  T b = atan2(z.imag(), z.real());
+  return std::complex<T>(numext::log(a), b);
+}
+
 } // end namespace internal
 } // end namespace Eigen
diff --git a/Eigen/src/Core/MatrixBase.h b/Eigen/src/Core/MatrixBase.h
index 45c3a596ecb09d5e55f4eaf3c742d0badb624373..d93a7e377ac1f1b83047e9e2220cc3f932fc9ad5 100644
--- a/Eigen/src/Core/MatrixBase.h
+++ b/Eigen/src/Core/MatrixBase.h
@@ -206,28 +206,22 @@ template<typename Derived> class MatrixBase
     EIGEN_DEVICE_FUNC
     DiagonalReturnType diagonal();
 
-    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
+    typedef Diagonal<const Derived> ConstDiagonalReturnType;
     EIGEN_DEVICE_FUNC
-    ConstDiagonalReturnType diagonal() const;
-
-    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived, Index> Type; };
-    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived, Index> Type; };
+    const ConstDiagonalReturnType diagonal() const;
 
     template<int Index>
     EIGEN_DEVICE_FUNC
-    typename DiagonalIndexReturnType<Index>::Type diagonal();
+    Diagonal<Derived, Index> diagonal();
 
     template<int Index>
     EIGEN_DEVICE_FUNC
-    typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
-
-    typedef Diagonal<Derived, DynamicIndex> DiagonalDynamicIndexReturnType;
-    typedef typename internal::add_const<Diagonal<const Derived, DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
+    const Diagonal<const Derived, Index> diagonal() const;
 
     EIGEN_DEVICE_FUNC
-    DiagonalDynamicIndexReturnType diagonal(Index index);
+    Diagonal<Derived, DynamicIndex> diagonal(Index index);
     EIGEN_DEVICE_FUNC
-    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
+    const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;
 
     template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
     template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
diff --git a/Eigen/src/Core/NumTraits.h b/Eigen/src/Core/NumTraits.h
index fdd4d4f519f19e2302ad488eec6e38e955b31012..e232651386446a53e5390ed9811277cbf2932eb1 100644
--- a/Eigen/src/Core/NumTraits.h
+++ b/Eigen/src/Core/NumTraits.h
@@ -245,12 +245,25 @@ template<> struct NumTraits<double> : GenericNumTraits<double>
   static inline double dummy_precision() { return 1e-12; }
 };
 
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
 template<> struct NumTraits<long double>
   : GenericNumTraits<long double>
 {
-  EIGEN_CONSTEXPR
-  static inline long double dummy_precision() { return 1e-15l; }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline long double dummy_precision() { return static_cast<long double>(1e-15l); }
+
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
+  static inline long double epsilon()
+  {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };
+#endif
 
 template<typename _Real> struct NumTraits<std::complex<_Real> >
   : GenericNumTraits<std::complex<_Real> >
@@ -289,9 +302,9 @@ struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
     IsInteger = NumTraits<Scalar>::IsInteger,
     IsSigned  = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost  = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
+    AddCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
+    MulCost   = ArrayType::SizeAtCompileTime==Dynamic ? HugeCost : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
 
   EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h
index 0be694259fe4d43305550ea7741eadea3a3f4bb9..17c06f0783dcb97cc019e83620df49cd60815817 100644
--- a/Eigen/src/Core/PartialReduxEvaluator.h
+++ b/Eigen/src/Core/PartialReduxEvaluator.h
@@ -54,12 +54,17 @@ struct packetwise_redux_traits
 
 /* Value to be returned when size==0 , by default let's return 0 */
 template<typename PacketType, typename Func>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const Func& ) { return pset1<PacketType>(0); }
+PacketType packetwise_redux_empty_value(const Func& ) {
+  const typename unpacket_traits<PacketType>::type zero(0);
+  return pset1<PacketType>(zero);
+}
 
 /* For products the default is 1 */
 template<typename PacketType, typename Scalar>
 EIGEN_DEVICE_FUNC
-PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) { return pset1<PacketType>(1); }
+PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar,Scalar>& ) {
+  return pset1<PacketType>(Scalar(1));
+}
 
 /* Perform the actual reduction */
 template<typename Func, typename Evaluator, ...>
 
   enum {
     CoeffReadCost = TraversalSize==Dynamic ? HugeCost
                   : TraversalSize==0 ? 1
-                  : TraversalSize * evaluator<ArgType>::CoeffReadCost + int(CostOpType::value),
+                  : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
 
     _ArgFlags = evaluator<ArgType>::Flags,
diff --git a/Eigen/src/Core/PlainObjectBase.h b/Eigen/src/Core/PlainObjectBase.h
index 202ed7100656bbdfd171be342461423f65b325b7..e2ddbd1d522a283a5992a504f023eebc48a05670 100644
--- a/Eigen/src/Core/PlainObjectBase.h
+++ b/Eigen/src/Core/PlainObjectBase.h
@@ -1019,7 +1019,7 @@ struct conservative_resize_like_impl
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(rows,cols);
+      Derived tmp(rows,cols);
      const Index common_rows = numext::mini(rows, _this.rows());
       const Index common_cols = numext::mini(cols, _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
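Editorial note: the `int(...)` casts in the cost enums above (and in the Redux/ProductEvaluators hunks below) all address the same issue — each unscoped enum has its own implementation-defined underlying type, so arithmetic mixing enumerators from different enums can overflow that type or draw enum-arithmetic warnings. Forcing every operand to `int` makes the computation happen in one known type. A compilable illustration of the pattern; the constant names are stand-ins, not Eigen's:

    #include <iostream>

    enum { Dynamic = -1, HugeCost = 10000 };
    enum { SizeAtCompileTime = 16 };  // stand-ins for the evaluator constants
    enum { CoeffReadCost = 2 };

    enum {
      // Every operand is cast to int, so no mixed-enum arithmetic occurs
      // and the multiply is evaluated in plain int.
      Cost = int(SizeAtCompileTime) == int(Dynamic)
                 ? int(HugeCost)
                 : int(SizeAtCompileTime) * int(CoeffReadCost)
    };

    int main() { std::cout << "Cost = " << Cost << "\n"; }  // Cost = 32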
@@ -1054,7 +1054,7 @@ struct conservative_resize_like_impl
     else
     {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(other);
+      Derived tmp(other);
       const Index common_rows = numext::mini(tmp.rows(), _this.rows());
       const Index common_cols = numext::mini(tmp.cols(), _this.cols());
       tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
index b766e1a1d643bfe5b4e7b2a574f839cf37ad6883..8cf294b287bc9a458cd9b5f0bb4c03e4a5b51126 100644
--- a/Eigen/src/Core/ProductEvaluators.h
+++ b/Eigen/src/Core/ProductEvaluators.h
@@ -831,7 +831,7 @@ struct diagonal_product_evaluator_base
   typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
 public:
   enum {
-    CoeffReadCost = NumTraits<Scalar>::MulCost + evaluator<MatrixType>::CoeffReadCost + evaluator<DiagonalType>::CoeffReadCost,
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) + int(evaluator<DiagonalType>::CoeffReadCost),
 
     MatrixFlags = evaluator<MatrixType>::Flags,
     DiagFlags = evaluator<DiagonalType>::Flags,
diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h
index 30598f4158decd1900e2c2f3667a6ff5396d4dd2..b6790d11050bb340c83acaa72197eba395c80222 100644
--- a/Eigen/src/Core/Redux.h
+++ b/Eigen/src/Core/Redux.h
@@ -58,7 +58,7 @@ public:
   public:
     enum {
       Cost = Evaluator::SizeAtCompileTime == Dynamic ? HugeCost
-           : Evaluator::SizeAtCompileTime * Evaluator::CoeffReadCost + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+           : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) + (Evaluator::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
       UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
     };
 
@@ -331,7 +331,7 @@ struct redux_impl
   enum {
     PacketSize = redux_traits<Func, Evaluator>::PacketSize,
     Size = Evaluator::SizeAtCompileTime,
-    VectorizedSize = (Size / PacketSize) * PacketSize
+    VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize)
   };
 
   template
diff --git a/Eigen/src/Core/Reshaped.h b/Eigen/src/Core/Reshaped.h
index 52de73b6fc371b8cbd45e13599d7c49b790903f7..882314cfe70089b12a588fec41318cc3a75fc733 100644
--- a/Eigen/src/Core/Reshaped.h
+++ b/Eigen/src/Core/Reshaped.h
@@ -250,7 +250,7 @@ class ReshapedImpl_dense
     EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR
     inline Index outerStride() const
     {
-      return ((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows();
+      return (((Flags&RowMajorBit)==RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
     }
 
   protected:
diff --git a/Eigen/src/Core/SelfAdjointView.h b/Eigen/src/Core/SelfAdjointView.h
index b7ed6f1cdcfbe5d0bc59dbbdf8270a03a20a9f83..8ce3b372a0ff792b98b8b1b00f1dd55742779bd0 100644
--- a/Eigen/src/Core/SelfAdjointView.h
+++ b/Eigen/src/Core/SelfAdjointView.h
@@ -66,7 +66,7 @@ template<typename _MatrixType, unsigned int UpLo> class SelfAdjointView
     enum {
       Mode = internal::traits<SelfAdjointView>::Mode,
       Flags = internal::traits<SelfAdjointView>::Flags,
-      TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0)
+      TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
     };
 
     typedef typename MatrixType::PlainObject PlainObject;
diff --git a/Eigen/src/Core/SolveTriangular.h b/Eigen/src/Core/SolveTriangular.h
index 38794447564d717dec9eb31e3845fe44787f6dd0..dfbf99523a9c9cb5328a20e7d0f80574a4ad2cbf 100644
--- a/Eigen/src/Core/SolveTriangular.h
+++ b/Eigen/src/Core/SolveTriangular.h
@@ -168,7 +168,7 @@ EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType,Mode,Dense>::solveInPlace(c
 {
   OtherDerived& other = _other.const_cast_derived();
   eigen_assert( derived().cols() == derived().rows() && ((Side==OnTheLeft && derived().cols() == other.rows()) || (Side==OnTheRight && derived().cols() == other.cols())) );
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
 
   // If solving for a 0x0 matrix, nothing to do, simply return.
   if (derived().cols() == 0)
     return;
diff --git a/Eigen/src/Core/SolverBase.h b/Eigen/src/Core/SolverBase.h
index 5014610420f3e8cea48f0b3fef0a345f1a920f8d..e38b3d5ad603389ae4036bc5de00b19aafff2d62 100644
--- a/Eigen/src/Core/SolverBase.h
+++ b/Eigen/src/Core/SolverBase.h
@@ -110,7 +110,7 @@ class SolverBase : public EigenBase<Derived>
     }
 
     /** \internal the return type of transpose() */
-    typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
+    typedef Transpose<const Derived> ConstTransposeReturnType;
     /** \returns an expression of the transposed of the factored matrix.
       *
       * A typical usage is to solve for the transposed problem A^T x = b:
@@ -118,16 +118,16 @@ class SolverBase : public EigenBase<Derived>
       *
       * \sa adjoint(), solve()
       */
-    inline ConstTransposeReturnType transpose() const
+    inline const ConstTransposeReturnType transpose() const
     {
       return ConstTransposeReturnType(derived());
     }
 
     /** \internal the return type of adjoint() */
     typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
-                        ConstTransposeReturnType
-                     >::type AdjointReturnType;
+                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const ConstTransposeReturnType>,
+                        const ConstTransposeReturnType
+                     >::type AdjointReturnType;
     /** \returns an expression of the adjoint of the factored matrix
       *
      * A typical usage is to solve for the adjoint problem A' x = b:
@@ -137,7 +137,7 @@ class SolverBase : public EigenBase<Derived>
      *
      * \sa transpose(), solve()
      */
-    inline AdjointReturnType adjoint() const
+    inline const AdjointReturnType adjoint() const
    {
      return AdjointReturnType(derived().transpose());
    }
diff --git a/Eigen/src/Core/Stride.h b/Eigen/src/Core/Stride.h
index 6494d51420a9b3eb195218434ace8b9f9c83d8eb..d164e5399697c00baa9d1b4f9156b2ba3d785a24 100644
--- a/Eigen/src/Core/Stride.h
+++ b/Eigen/src/Core/Stride.h
@@ -38,10 +38,14 @@ namespace Eigen {
   * \include Map_general_stride.cpp
   * Output: \verbinclude Map_general_stride.out
   *
-  * Both strides can be negative, however, a negative stride of -1 cannot be specified at compiletime
+  * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
   * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
   * not allowed).
   *
+  * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompileTime==1),
+  * the inner stride is the pointer increment between two consecutive elements,
+  * regardless of storage layout.
+ * * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders */ template diff --git a/Eigen/src/Core/Transpose.h b/Eigen/src/Core/Transpose.h index 2bc658f40b88bd588d39457dc9b8f19deb0c14cf..741504d9571eef3c06029ddc1417332014331db5 100644 --- a/Eigen/src/Core/Transpose.h +++ b/Eigen/src/Core/Transpose.h @@ -178,7 +178,7 @@ template class TransposeImpl * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -Transpose +typename DenseBase::TransposeReturnType DenseBase::transpose() { return TransposeReturnType(derived()); @@ -191,7 +191,7 @@ DenseBase::transpose() * \sa transposeInPlace(), adjoint() */ template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -typename DenseBase::ConstTransposeReturnType +const typename DenseBase::ConstTransposeReturnType DenseBase::transpose() const { return ConstTransposeReturnType(derived()); diff --git a/Eigen/src/Core/TriangularMatrix.h b/Eigen/src/Core/TriangularMatrix.h index 779152fa7344116d410017f81183894543864888..fdb8bc15a5b2c9ea0c133b26856a0cd091e3127a 100644 --- a/Eigen/src/Core/TriangularMatrix.h +++ b/Eigen/src/Core/TriangularMatrix.h @@ -53,7 +53,7 @@ template class TriangularBase : public EigenBase typedef Derived const& Nested; EIGEN_DEVICE_FUNC - inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); } + inline TriangularBase() { eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag)))); } EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR inline Index rows() const EIGEN_NOEXCEPT { return derived().rows(); } @@ -819,7 +819,7 @@ void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src, con enum { unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost - && DstXprType::SizeAtCompileTime * (DstEvaluatorType::CoeffReadCost+SrcEvaluatorType::CoeffReadCost) / 2 <= EIGEN_UNROLLING_LIMIT + && DstXprType::SizeAtCompileTime * (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <= EIGEN_UNROLLING_LIMIT }; triangular_assignment_loop::run(kernel); @@ -853,7 +853,7 @@ struct Assignment { EIGEN_DEVICE_FUNC static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) { - call_triangular_assignment_loop(dst, src, func); + call_triangular_assignment_loop(dst, src, func); } }; @@ -951,7 +951,7 @@ template EIGEN_DEVICE_FUNC void TriangularBase::evalToLazy(MatrixBase &other) const { other.derived().resize(this->rows(), this->cols()); - internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); + internal::call_triangular_assignment_loop(other.derived(), derived().nestedExpression()); } namespace internal { diff --git a/Eigen/src/Core/Visitor.h b/Eigen/src/Core/Visitor.h index 07a2e42433e2ad1d21ba8d94ecb8394809df65b0..00bcca87768eba2f48d31759a14a3c42beb3f6b1 100644 --- a/Eigen/src/Core/Visitor.h +++ b/Eigen/src/Core/Visitor.h @@ -124,7 +124,7 @@ void DenseBase::visit(Visitor& visitor) const enum { unroll = SizeAtCompileTime != Dynamic - && SizeAtCompileTime * ThisEvaluator::CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits::Cost <= EIGEN_UNROLLING_LIMIT + && SizeAtCompileTime * int(ThisEvaluator::CoeffReadCost) + (SizeAtCompileTime-1) * int(internal::functor_traits::Cost) <= EIGEN_UNROLLING_LIMIT }; return internal::visitor_impl::run(thisEval, visitor); } diff --git a/Eigen/src/Core/arch/AVX/Complex.h b/Eigen/src/Core/arch/AVX/Complex.h index 506ca0be57157d639cc8287486aae97c88e2c89e..e9096c0a1ace3804067013864b170700433c1336 100644 --- 
a/Eigen/src/Core/arch/AVX/Complex.h +++ b/Eigen/src/Core/arch/AVX/Complex.h @@ -99,7 +99,9 @@ template<> EIGEN_STRONG_INLINE Packet4cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cf pset1(const std::complex& from) { - return Packet4cf(_mm256_castpd_ps(_mm256_broadcast_sd((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet4cf ploaddup(const std::complex* from) @@ -167,39 +169,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P Packet2cf(_mm256_extractf128_ps(a.v, 1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) @@ -350,39 +319,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd(_mm256_extractf128_pd(a.v,1)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) diff --git a/Eigen/src/Core/arch/AVX/PacketMath.h b/Eigen/src/Core/arch/AVX/PacketMath.h index dd3f243d24ab3401b51aec8acca07de176c19478..24e01c46fa838a8ccc8e5dfbf4d94fbf03750395 100644 --- a/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/Eigen/src/Core/arch/AVX/PacketMath.h @@ -285,11 +285,13 @@ template<> EIGEN_STRONG_INLINE Packet8i psub(const Packet8i& a, const template<> EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { - return _mm256_sub_ps(_mm256_set1_ps(0.0),a); + const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)); + return _mm256_xor_ps(a, mask); } template<> EIGEN_STRONG_INLINE Packet4d 
pnegate(const Packet4d& a) { - return _mm256_sub_pd(_mm256_set1_pd(0.0),a); + const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL)); + return _mm256_xor_pd(a, mask); } template<> EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) { return a; } @@ -628,11 +630,23 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4d& template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet8i& from) { EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from); } template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from, uint8_t umask) { +#ifdef EIGEN_VECTORIZE_AVX512 + __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF); + EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from)); +#else Packet8i mask = _mm256_set1_epi8(static_cast(umask)); - const Packet8i bit_mask = _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe); + const Packet8i bit_mask = _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe); mask = por(mask, bit_mask); mask = pcmp_eq(mask, _mm256_set1_epi32(0xffffffff)); - EIGEN_DEBUG_UNALIGNED_STORE return _mm256_maskstore_ps(to, mask, from); +#if EIGEN_COMP_MSVC + // MSVC sometimes seems to use a bogus mask with maskstore. + const __m256i ifrom = _mm256_castps_si256(from); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0), reinterpret_cast(to)); + EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1), reinterpret_cast(to + 4)); +#else + EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from); +#endif +#endif } // NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available @@ -1006,7 +1020,7 @@ EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { #ifdef EIGEN_HAS_FP16_C - return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); + return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT); #else EIGEN_ALIGN32 float aux[8]; pstore(aux, a); @@ -1274,12 +1288,7 @@ EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) { EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { Packet8bf r; - // Flush input denormals value to zero with hardware capability. 
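Editorial note: the corrected `bit_mask` constants in the `pstoreu` hunk above matter because `_mm256_set1_epi8(umask)` replicates the 8-bit store mask into every byte of each 32-bit lane. OR-ing lane i with `~(1 << i)` replicated per byte yields all-ones exactly when bit i of the mask is set, which the subsequent compare-equal turns into a full lane mask. A scalar model of one lane; the function name is mine:

    #include <cassert>
    #include <cstdint>

    std::uint32_t lane_mask(std::uint8_t umask, int lane) {
      std::uint32_t broadcast = 0x01010101u * umask;           // _mm256_set1_epi8
      std::uint32_t byte = ~(1u << lane) & 0xffu;              // e.g. 0xfe for lane 0
      std::uint32_t bit_mask = 0x01010101u * byte;             // 0xfefefefe, 0xfdfdfdfd, ...
      // pcmp_eq against all-ones: true iff bit `lane` of umask was set.
      return (broadcast | bit_mask) == 0xffffffffu ? 0xffffffffu : 0u;
    }

    int main() {
      assert(lane_mask(0x05, 0) == 0xffffffffu);  // lane 0 selected
      assert(lane_mask(0x05, 1) == 0u);           // lane 1 not selected
      assert(lane_mask(0x05, 2) == 0xffffffffu);
    }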
- _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); - __m256 flush = _mm256_and_ps(a, a); - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - - __m256i input = _mm256_castps_si256(flush); + __m256i input = _mm256_castps_si256(a); #ifdef EIGEN_VECTORIZE_AVX2 // uint32_t lsb = (input >> 16); @@ -1293,7 +1302,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { // input = input >> 16; t = _mm256_srli_epi32(t, 16); // Check NaN before converting back to bf16 - __m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q); + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); __m256i nan = _mm256_set1_epi32(0x7fc0); t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask)); // output = numext::bit_cast(input); @@ -1316,7 +1325,7 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) { lo = _mm_srli_epi32(lo, 16); hi = _mm_srli_epi32(hi, 16); // Check NaN before converting back to bf16 - __m256 mask = _mm256_cmp_ps(flush, flush, _CMP_ORD_Q); + __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q); __m128i nan = _mm_set1_epi32(0x7fc0); lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask))); hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1))); diff --git a/Eigen/src/Core/arch/AVX512/Complex.h b/Eigen/src/Core/arch/AVX512/Complex.h index 45f22f436f9ebaa7699cda57acd90555443a6fe7..0167d050e95434003d12bd1e56807d9d4fc63f0f 100644 --- a/Eigen/src/Core/arch/AVX512/Complex.h +++ b/Eigen/src/Core/arch/AVX512/Complex.h @@ -37,7 +37,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -97,7 +97,9 @@ template<> EIGEN_STRONG_INLINE Packet8cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet8cf pset1(const std::complex& from) { - return Packet8cf(_mm512_castpd_ps(pload1((const double*)(const void*)&from))); + const float re = std::real(from); + const float im = std::imag(from); + return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet8cf ploaddup(const std::complex* from) @@ -153,39 +155,6 @@ EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4(const Packet8cf& a) return Packet4cf(res); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet8cf pmadd(const Packet8cf& x, const Packet8cf& y, const Packet8cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet8cf pmul(const Packet8cf& a, const Packet8cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf,Packet16f) template<> EIGEN_STRONG_INLINE Packet8cf pdiv(const Packet8cf& a, const Packet8cf& b) @@ -225,7 +194,7 @@ template<> struct packet_traits > : default_packet_traits HasMul = 1, HasDiv = 1, HasNegate = 1, - HasSqrt = 1, + HasSqrt = EIGEN_HAS_AVX512_MATH, HasAbs = 0, HasAbs2 = 0, HasMin = 0, @@ -286,11 
+255,7 @@ template<> EIGEN_STRONG_INLINE Packet4cd ploadu(const std::complex EIGEN_STRONG_INLINE Packet4cd pset1(const std::complex& from) { - #ifdef EIGEN_VECTORIZE_AVX512DQ - return Packet4cd(_mm512_broadcast_f64x2(pset1(from).v)); - #else return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4( _mm_castpd_ps(pset1(from).v)))); - #endif } template<> EIGEN_STRONG_INLINE Packet4cd ploaddup(const std::complex* from) { @@ -441,6 +406,8 @@ ptranspose(PacketBlock& kernel) { kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0,2,0,2>::mask))); // [a0 b0 c0 d0] } +#if EIGEN_HAS_AVX512_MATH + template<> EIGEN_STRONG_INLINE Packet4cd psqrt(const Packet4cd& a) { return psqrt_complex(a); } @@ -449,6 +416,8 @@ template<> EIGEN_STRONG_INLINE Packet8cf psqrt(const Packet8cf& a) { return psqrt_complex(a); } +#endif + } // end namespace internal } // end namespace Eigen diff --git a/Eigen/src/Core/arch/AVX512/MathFunctions.h b/Eigen/src/Core/arch/AVX512/MathFunctions.h index 41929cb348b5d207dd9a80e47e2decac7f6a1596..017d6bff00ca99891d6ddb1f3c6a6b3e629e7265 100644 --- a/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -14,8 +14,7 @@ namespace Eigen { namespace internal { -// Disable the code for older versions of gcc that don't support many of the required avx512 instrinsics. -#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 +#if EIGEN_HAS_AVX512_MATH #define _EIGEN_DECLARE_CONST_Packet16f(NAME, X) \ const Packet16f p16f_##NAME = pset1(X) @@ -119,74 +118,11 @@ pexp(const Packet16f& _x) { return pmax(pmul(y, _mm512_castsi512_ps(emm0)), _x); } -/*template <> +template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d pexp(const Packet8d& _x) { - Packet8d x = _x; - - _EIGEN_DECLARE_CONST_Packet8d(1, 1.0); - _EIGEN_DECLARE_CONST_Packet8d(2, 2.0); - - _EIGEN_DECLARE_CONST_Packet8d(exp_hi, 709.437); - _EIGEN_DECLARE_CONST_Packet8d(exp_lo, -709.436139303); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_LOG2EF, 1.4426950408889634073599); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p0, 1.26177193074810590878e-4); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p1, 3.02994407707441961300e-2); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_p2, 9.99999999999999999910e-1); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q0, 3.00198505138664455042e-6); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q1, 2.52448340349684104192e-3); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q2, 2.27265548208155028766e-1); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_q3, 2.00000000000000000009e0); - - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C1, 0.693145751953125); - _EIGEN_DECLARE_CONST_Packet8d(cephes_exp_C2, 1.42860682030941723212e-6); - - // clamp x - x = pmax(pmin(x, p8d_exp_hi), p8d_exp_lo); - - // Express exp(x) as exp(g + n*log(2)). - const Packet8d n = - _mm512_mul_round_pd(p8d_cephes_LOG2EF, x, _MM_FROUND_TO_NEAREST_INT); - - // Get the remainder modulo log(2), i.e. the "g" described above. Subtract - // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last - // digits right. - const Packet8d nC1 = pmul(n, p8d_cephes_exp_C1); - const Packet8d nC2 = pmul(n, p8d_cephes_exp_C2); - x = psub(x, nC1); - x = psub(x, nC2); - - const Packet8d x2 = pmul(x, x); - - // Evaluate the numerator polynomial of the rational interpolant. 
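Editorial note: the block being deleted here (already commented out) follows the classic Cephes recipe that `pexp_double` also builds on: write exp(x) = 2^n * exp(g) with n = round(x * log2(e)), and subtract n*ln(2) in two pieces C1 + C2 = ln(2) so the low-order bits of the reduced argument survive in double precision. A scalar sketch of just the range reduction, with `std::exp` standing in for the polynomial/rational step:

    #include <cmath>
    #include <cstdio>

    double exp_sketch(double x) {
      const double LOG2E = 1.4426950408889634073599;
      const double C1 = 0.693145751953125;          // high bits of ln(2)
      const double C2 = 1.42860682030941723212e-6;  // low bits of ln(2)
      double n = std::nearbyint(x * LOG2E);
      double g = (x - n * C1) - n * C2;             // |g| <= ln(2)/2, done in two steps
      // Any minimax approximation of exp on [-ln2/2, ln2/2] goes here.
      return std::ldexp(std::exp(g), static_cast<int>(n));  // scale by 2^n
    }

    int main() {
      std::printf("%.17g vs %.17g\n", exp_sketch(10.5), std::exp(10.5));
    }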
- Packet8d px = p8d_cephes_exp_p0; - px = pmadd(px, x2, p8d_cephes_exp_p1); - px = pmadd(px, x2, p8d_cephes_exp_p2); - px = pmul(px, x); - - // Evaluate the denominator polynomial of the rational interpolant. - Packet8d qx = p8d_cephes_exp_q0; - qx = pmadd(qx, x2, p8d_cephes_exp_q1); - qx = pmadd(qx, x2, p8d_cephes_exp_q2); - qx = pmadd(qx, x2, p8d_cephes_exp_q3); - - // I don't really get this bit, copied from the SSE2 routines, so... - // TODO(gonnet): Figure out what is going on here, perhaps find a better - // rational interpolant? - x = _mm512_div_pd(px, psub(qx, px)); - x = pmadd(p8d_2, x, p8d_1); - - // Build e=2^n. - const Packet8d e = _mm512_castsi512_pd(_mm512_slli_epi64( - _mm512_add_epi64(_mm512_cvtpd_epi64(n), _mm512_set1_epi64(1023)), 52)); - - // Construct the result 2^n * exp(g) = e * x. The max is used to catch - // non-finite values in the input. - return pmax(pmul(x, e), _x); - }*/ + return pexp_double(_x); +} F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp) @@ -389,7 +325,7 @@ Packet16f pexpm1(const Packet16f& _x) { F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1) -#endif +#endif // EIGEN_HAS_AVX512_MATH template <> diff --git a/Eigen/src/Core/arch/AVX512/PacketMath.h b/Eigen/src/Core/arch/AVX512/PacketMath.h index f8741372d9aa47a01e2320911b8ae7913194b797..4ab100cec1815cf29041cfe7f251cc83a2960ad3 100644 --- a/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -28,6 +28,13 @@ namespace internal { #endif #endif +// Disable the code for older versions of gcc that don't support many of the required avx512 math instrinsics. +#if EIGEN_GNUC_AT_LEAST(5, 3) || EIGEN_COMP_CLANG || EIGEN_COMP_MSVC >= 1923 || EIGEN_COMP_ICC >= 1900 +#define EIGEN_HAS_AVX512_MATH 1 +#else +#define EIGEN_HAS_AVX512_MATH 0 +#endif + typedef __m512 Packet16f; typedef __m512i Packet16i; typedef __m512d Packet8d; @@ -72,12 +79,14 @@ struct packet_traits : default_packet_traits { HasMax = 1, HasConj = 1, HasSetLinear = 0, - HasLog = 1, - HasLog1p = 1, - HasExpm1 = 1, - HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasLog = EIGEN_HAS_AVX512_MATH, + HasLog1p = EIGEN_HAS_AVX512_MATH, + HasExp = EIGEN_HAS_AVX512_MATH, + HasExpm1 = EIGEN_HAS_AVX512_MATH, + HasSqrt = EIGEN_HAS_AVX512_MATH, + HasRsqrt = EIGEN_HAS_AVX512_MATH, + HasBessel = EIGEN_HAS_AVX512_MATH, + HasNdtri = EIGEN_HAS_AVX512_MATH, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, HasTanh = EIGEN_FAST_MATH, @@ -86,9 +95,7 @@ struct packet_traits : default_packet_traits { HasRound = 1, HasFloor = 1, HasCeil = 1, - HasRint = 1, - HasBessel = 1, - HasNdtri = 1 + HasRint = 1 }; }; @@ -109,7 +116,7 @@ template<> struct packet_traits : default_packet_traits HasBlend = 0, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, HasLog1p = 1, HasExpm1 = 1, @@ -138,8 +145,9 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH HasLog = 1, + HasExp = 1, HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif @@ -288,11 +296,20 @@ EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { - return _mm512_sub_ps(_mm512_set1_ps(0.0), a); + // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random 
results. + // The intel docs give it a relatively high latency as well, so we're probably + // better off with using _mm512_set_epi32 directly anyways. + const __m512i mask = _mm512_set_epi32(0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000, + 0x80000000,0x80000000,0x80000000,0x80000000); + return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask)); } template <> EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) { - return _mm512_sub_pd(_mm512_set1_pd(0.0), a); + const __m512i mask = _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, + 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL); + return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask)); } template <> @@ -486,7 +503,7 @@ template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packe } template<> EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) { - __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGT_UQ); + __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ); return _mm512_castsi512_ps( _mm512_mask_set1_epi32(_mm512_set1_epi32(0), mask, 0xffffffffu)); } @@ -517,7 +534,7 @@ EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) { } template <> EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) { - __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGT_UQ); + __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ); return _mm512_castsi512_pd( _mm512_mask_set1_epi64(_mm512_set1_epi64(0), mask, 0xffffffffffffffffu)); } @@ -685,7 +702,7 @@ EIGEN_STRONG_INLINE Packet8d pload(const double* from) { template <> EIGEN_STRONG_INLINE Packet16i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_si512( - reinterpret_cast(from)); + reinterpret_cast(from)); } template <> @@ -929,7 +946,8 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons Packet8i b = parithmetic_shift_right<2>(e); // floor(e/4) // 2^b - Packet8i hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); Packet8i lo = _mm256_slli_epi64(hi, 52); hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); @@ -937,7 +955,7 @@ template<> EIGEN_STRONG_INLINE Packet8d pldexp(const Packet8d& a, cons // 2^(e - 3b) b = psub(psub(psub(e, b), b), b); // e - 3b - hi = _mm256_shuffle_epi32(padd(b, bias), _MM_SHUFFLE(3, 1, 2, 0)); + hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx); lo = _mm256_slli_epi64(hi, 52); hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52); c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1)); @@ -1424,60 +1442,11 @@ ploadquad(const Eigen::half* from) { } EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtph_ps(a); -#else - EIGEN_ALIGN64 half aux[16]; - pstore(aux, a); - float f0(aux[0]); - float f1(aux[1]); - float f2(aux[2]); - float f3(aux[3]); - float f4(aux[4]); - float f5(aux[5]); - float f6(aux[6]); - float f7(aux[7]); - float f8(aux[8]); - float f9(aux[9]); - float fa(aux[10]); - float fb(aux[11]); - float fc(aux[12]); - float fd(aux[13]); - float fe(aux[14]); - float 
ff(aux[15]); - - return _mm512_set_ps( - ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0); -#endif } EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { -#ifdef EIGEN_HAS_FP16_C return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT|_MM_FROUND_NO_EXC); -#else - EIGEN_ALIGN64 float aux[16]; - pstore(aux, a); - half h0(aux[0]); - half h1(aux[1]); - half h2(aux[2]); - half h3(aux[3]); - half h4(aux[4]); - half h5(aux[5]); - half h6(aux[6]); - half h7(aux[7]); - half h8(aux[8]); - half h9(aux[9]); - half ha(aux[10]); - half hb(aux[11]); - half hc(aux[12]); - half hd(aux[13]); - half he(aux[14]); - half hf(aux[15]); - - return _mm256_set_epi16( - hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x, - h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x); -#endif } template<> EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) { @@ -1850,7 +1819,7 @@ struct packet_traits : default_packet_traits { HasInsert = 1, HasSin = EIGEN_FAST_MATH, HasCos = EIGEN_FAST_MATH, -#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) +#if EIGEN_HAS_AVX512_MATH #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, // Currently fails test with bad accuracy. HasLog1p = 1, @@ -1943,23 +1912,15 @@ EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) { EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { Packet16bf r; - // Flush input denormals value to zero with hardware capability. - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -#if defined(EIGEN_VECTORIZE_AVX512DQ) - __m512 flush = _mm512_and_ps(a, a); -#else - __m512 flush = _mm512_max_ps(a, a); -#endif // EIGEN_VECTORIZE_AVX512DQ - _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_OFF); - #if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_AT_LEAST(10, 1) // Since GCC 10.1 supports avx512bf16 and C style explicit cast // (C++ static_cast is not supported yet), do converion via intrinsic // and register path for performance. 
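Editorial note: apart from dropping the denormal-flushing, the `F32ToBf16` paths in these hunks keep the usual round-to-nearest-even trick: add `0x7fff` plus the low bit of the truncated result, then keep the high 16 bits, with NaNs canonicalized to the quiet payload `0x7fc0`. A scalar model of the conversion; the function name is mine:

    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <cstring>

    std::uint16_t f32_to_bf16(float f) {
      std::uint32_t input;
      std::memcpy(&input, &f, sizeof(input));  // bit_cast
      if (std::isnan(f)) return 0x7fc0;        // canonical quiet NaN
      std::uint32_t lsb = (input >> 16) & 1;   // makes ties round to even
      input += 0x7fff + lsb;                   // round to nearest
      return static_cast<std::uint16_t>(input >> 16);
    }

    int main() {
      assert(f32_to_bf16(1.0f) == 0x3f80);     // bfloat16(1.0)
      assert(f32_to_bf16(std::nanf("")) == 0x7fc0);
    }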
- r = (__m256i)(_mm512_cvtneps_pbh(flush)); + r = (__m256i)(_mm512_cvtneps_pbh(a)); + #else __m512i t; - __m512i input = _mm512_castps_si512(flush); + __m512i input = _mm512_castps_si512(a); __m512i nan = _mm512_set1_epi32(0x7fc0); // uint32_t lsb = (input >> 16) & 1; @@ -1972,9 +1933,9 @@ EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) { t = _mm512_srli_epi32(t, 16); // Check NaN before converting back to bf16 - __mmask16 mask = _mm512_cmp_ps_mask(flush, flush, _CMP_ORD_Q); - t = _mm512_mask_blend_epi32(mask, nan, t); + __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q); + t = _mm512_mask_blend_epi32(mask, nan, t); // output.value = static_cast(input); r = _mm512_cvtepi32_epi16(t); #endif // EIGEN_VECTORIZE_AVX512BF16 diff --git a/Eigen/src/Core/arch/AltiVec/Complex.h b/Eigen/src/Core/arch/AltiVec/Complex.h index c6cb59e8f1c3ec4dcb37d0fd1cdb4c4428ab1c93..b3932998c1bc6b7383dc99ef75049409b6ee3c0d 100644 --- a/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/Eigen/src/Core/arch/AltiVec/Complex.h @@ -74,7 +74,7 @@ struct Packet2cf return Packet2cf(*this) -= b; } EIGEN_STRONG_INLINE Packet2cf operator-(void) const { - return Packet2cf(vec_neg(v)); + return Packet2cf(-v); } Packet4f v; @@ -127,20 +127,20 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet2cf& from) { pstore((float*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet2cf& from) { pstoreu((float*)to, from.v); } -EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex* from0, const std::complex* from1) +EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex& from0, const std::complex& from1) { Packet4f res0, res1; #ifdef __VSX__ - __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (*from0)); - __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (*from1)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res0) : "Z" (from0)); + __asm__ ("lxsdx %x0,%y1" : "=wa" (res1) : "Z" (from1)); #ifdef _BIG_ENDIAN __asm__ ("xxpermdi %x0, %x1, %x2, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #else __asm__ ("xxpermdi %x0, %x2, %x1, 0" : "=wa" (res0) : "wa" (res0), "wa" (res1)); #endif #else - *reinterpret_cast *>(&res0) = *from0; - *reinterpret_cast *>(&res1) = *from1; + *reinterpret_cast *>(&res0) = from0; + *reinterpret_cast *>(&res1) = from1; res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI); #endif return Packet2cf(res0); @@ -206,45 +206,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE 
Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } @@ -327,7 +294,7 @@ struct Packet1cd return Packet1cd(*this) -= b; } EIGEN_STRONG_INLINE Packet1cd operator-(void) const { - return Packet1cd(vec_neg(v)); + return Packet1cd(-v); } Packet2d v; @@ -404,45 +371,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); return Packet1cd(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_REVERSE64)))); } diff --git a/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/Eigen/src/Core/arch/AltiVec/MathFunctions.h index 3a7a329361eea0cd6ba25b81d663a8409ed70384..2b7c204e3fd37230c62b3dacf261e47313ff6d8d 100644 --- a/Eigen/src/Core/arch/AltiVec/MathFunctions.h +++ b/Eigen/src/Core/arch/AltiVec/MathFunctions.h @@ -40,24 +40,9 @@ Packet4f pcos(const Packet4f& _x) return pcos_float(_x); } -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet4f prsqrt(const Packet4f& x) -{ - return vec_rsqrt(x); -} -#endif - #ifdef __VSX__ -#ifndef EIGEN_COMP_CLANG -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED -Packet2d prsqrt(const Packet2d& x) -{ - return vec_rsqrt(x); -} -#endif -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt(const Packet4f& x) { return vec_sqrt(x); @@ -69,12 +54,41 @@ Packet2d psqrt(const Packet2d& x) return vec_sqrt(x); } -template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet4f prsqrt(const Packet4f& x) +{ + return pset1(1.0f) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS +Packet2d prsqrt(const Packet2d& x) +{ + return pset1(1.0) / psqrt(x); +// vec_rsqrt returns different results from the generic version +// return vec_rsqrt(x); +} + +template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS 
Packet2d pexp(const Packet2d& _x) { return pexp_double(_x); } -#endif + +template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); +} + +template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ + BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); +} + +#endif // __VSX__ // Hyperbolic Tangent function. template <> diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h index e3ba06159ff2246f0d7ca3bdc328b828c0526f37..ea7749610d7ccfee948e9b111476c60db9b6a7cc 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProduct.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProduct.h @@ -11,26 +11,41 @@ #ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_ALTIVEC_H +#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK +#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1 +#endif + #include "MatrixProductCommon.h" -// Since LLVM doesn't support dynamic dispatching, force either always MMA or VSX -#if EIGEN_COMP_LLVM -#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) && !defined(EIGEN_ALTIVEC_MMA_ONLY) -#ifdef __MMA__ -#define EIGEN_ALTIVEC_MMA_ONLY -#else -#define EIGEN_ALTIVEC_DISABLE_MMA +#if !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#define EIGEN_ALTIVEC_DISABLE_MMA 0 #endif + +// Check for MMA builtin support. +#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin) +#if __has_builtin(__builtin_mma_assemble_acc) + #define EIGEN_ALTIVEC_MMA_SUPPORT #endif #endif -#ifdef __has_builtin -#if __has_builtin(__builtin_mma_assemble_acc) - #define ALTIVEC_MMA_SUPPORT +// Check if and how we should actually use MMA if supported. +#if defined(EIGEN_ALTIVEC_MMA_SUPPORT) + +#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH) +#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0 #endif + +// Check if we want to enable dynamic dispatch. Not supported by LLVM. +#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM +#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1 +// Otherwise, use MMA by default if available. +#elif defined(__MMA__) +#define EIGEN_ALTIVEC_MMA_ONLY 1 #endif -#if defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) +#endif // EIGEN_ALTIVEC_MMA_SUPPORT + +#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) #include "MatrixProductMMA.h" #endif @@ -113,7 +128,7 @@ const static Packet16uc p16uc_GETIMAG64 = { 8, 9, 10, 11, 12, 13, 14, 15, * float32/64 and complex float32/64 version. 
**/ template -EIGEN_STRONG_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) +EIGEN_ALWAYS_INLINE std::complex getAdjointVal(Index i, Index j, const_blas_data_mapper, Index, StorageOrder>& dt) { std::complex v; if(i < j) @@ -160,24 +175,23 @@ EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex* bloc rir += vectorDelta; } - if (j < cols) + + for(; j < cols; j++) { - rii = rir + ((cols - j) * rows); + rii = rir + rows; for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - std::complex v = getAdjointVal(i, k, rhs); + std::complex v = getAdjointVal(i, j, rhs); - blockBf[rir] = v.real(); - blockBf[rii] = v.imag(); + blockBf[rir] = v.real(); + blockBf[rii] = v.imag(); - rir += 1; - rii += 1; - } + rir += 1; + rii += 1; } + + rir += rows; } } @@ -256,19 +270,15 @@ EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs } } - if (j < cols) + for(; j < cols; j++) { for(Index i = k2; i < depth; i++) { - Index k = j; - for(; k < cols; k++) - { - if(k <= i) - blockB[ri] = rhs(i, k); - else - blockB[ri] = rhs(k, i); - ri += 1; - } + if(j <= i) + blockB[ri] = rhs(i, j); + else + blockB[ri] = rhs(j, i); + ri += 1; } } } @@ -402,22 +412,18 @@ struct symm_pack_lhs * and offset and behaves accordingly. **/ -template -EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) -{ - const Index size = 16 / sizeof(Scalar); - pstore(to + (0 * size), block.packet[0]); - pstore(to + (1 * size), block.packet[1]); - pstore(to + (2 * size), block.packet[2]); - pstore(to + (3 * size), block.packet[3]); -} - -template -EIGEN_STRONG_INLINE void storeBlock(Scalar* to, PacketBlock& block) +template +EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock& block) { const Index size = 16 / sizeof(Scalar); pstore(to + (0 * size), block.packet[0]); pstore(to + (1 * size), block.packet[1]); + if (N > 2) { + pstore(to + (2 * size), block.packet[2]); + } + if (N > 3) { + pstore(to + (3 * size), block.packet[3]); + } } // General template for lhs & rhs complex packing. @@ -443,9 +449,9 @@ struct dhs_cpack { PacketBlock cblock; if (UseLhs) { - bload(cblock, lhs, j, i); + bload(cblock, lhs, j, i); } else { - bload(cblock, lhs, i, j); + bload(cblock, lhs, i, j); } blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, p16uc_GETREAL32); @@ -472,8 +478,8 @@ struct dhs_cpack { ptranspose(blocki); } - storeBlock(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 4*vectorSize; rii += 4*vectorSize; @@ -493,21 +499,12 @@ struct dhs_cpack { cblock.packet[1] = lhs.template loadPacket(i, j + 2); } } else { - const std::complex *lhs0, *lhs1; if (UseLhs) { - lhs0 = &lhs(j + 0, i); - lhs1 = &lhs(j + 1, i); - cblock.packet[0] = pload2(lhs0, lhs1); - lhs0 = &lhs(j + 2, i); - lhs1 = &lhs(j + 3, i); - cblock.packet[1] = pload2(lhs0, lhs1); + cblock.packet[0] = pload2(lhs(j + 0, i), lhs(j + 1, i)); + cblock.packet[1] = pload2(lhs(j + 2, i), lhs(j + 3, i)); } else { - lhs0 = &lhs(i, j + 0); - lhs1 = &lhs(i, j + 1); - cblock.packet[0] = pload2(lhs0, lhs1); - lhs0 = &lhs(i, j + 2); - lhs1 = &lhs(i, j + 3); - cblock.packet[1] = pload2(lhs0, lhs1); + cblock.packet[0] = pload2(lhs(i, j + 0), lhs(i, j + 1)); + cblock.packet[1] = pload2(lhs(i, j + 2), lhs(i, j + 3)); } } @@ -529,34 +526,50 @@ struct dhs_cpack { rir += ((PanelMode) ? 
(vectorSize*(2*stride - depth)) : vectorDelta); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) rir += (offset*(rows - j - vectorSize)); - rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + if(PanelMode) rir -= (offset*(vectorSize - 1)); - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + rii = rir + ((PanelMode) ? stride : depth); + + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockAt[rir] = lhs(i, j).real(); + + if(Conjugate) + blockAt[rii] = -lhs(i, j).imag(); + else + blockAt[rii] = lhs(i, j).imag(); + + rir += 1; + rii += 1; + } + + rir += ((PanelMode) ? (2*stride - depth) : depth); + } + } else { + if (j < rows) + { + if(PanelMode) rir += (offset*(rows - j - vectorSize)); + rii = rir + (((PanelMode) ? stride : depth) * (rows - j)); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockAt[rir] = lhs(k, i).real(); if(Conjugate) blockAt[rii] = -lhs(k, i).imag(); else blockAt[rii] = lhs(k, i).imag(); - } else { - blockAt[rir] = lhs(i, k).real(); - if(Conjugate) - blockAt[rii] = -lhs(i, k).imag(); - else - blockAt[rii] = lhs(i, k).imag(); + rir += 1; + rii += 1; } - - rir += 1; - rii += 1; } } } @@ -582,16 +595,16 @@ struct dhs_pack{ PacketBlock block; if (UseLhs) { - bload(block, lhs, j, i); + bload(block, lhs, j, i); } else { - bload(block, lhs, i, j); + bload(block, lhs, i, j); } if(((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) { ptranspose(block); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 4*vectorSize; } @@ -626,21 +639,33 @@ struct dhs_pack{ if(PanelMode) ri += vectorSize*(stride - offset - depth); } - if (j < rows) + if (!UseLhs) { - if(PanelMode) ri += offset*(rows - j); + if(PanelMode) ri += offset; - for(Index i = 0; i < depth; i++) + for(; j < rows; j++) { - Index k = j; - for(; k < rows; k++) + for(Index i = 0; i < depth; i++) { - if (UseLhs) { + blockA[ri] = lhs(i, j); + ri += 1; + } + + if(PanelMode) ri += stride - depth; + } + } else { + if (j < rows) + { + if(PanelMode) ri += offset*(rows - j); + + for(Index i = 0; i < depth; i++) + { + Index k = j; + for(; k < rows; k++) + { blockA[ri] = lhs(k, i); - } else { - blockA[ri] = lhs(i, k); + ri += 1; } - ri += 1; } } } @@ -676,7 +701,7 @@ struct dhs_pack(j, i + 1); } - storeBlock(blockA + ri, block); + storeBlock(blockA + ri, block); ri += 2*vectorSize; } @@ -753,7 +778,7 @@ struct dhs_pack(i + 1, j + 0); //[b1 b2] block.packet[3] = rhs.template loadPacket(i + 1, j + 2); //[b3 b4] - storeBlock(blockB + ri, block); + storeBlock(blockB + ri, block); } ri += 4*vectorSize; @@ -784,19 +809,17 @@ struct dhs_pack(blockAt + rir, blockr); - storeBlock(blockAt + rii, blocki); + storeBlock(blockAt + rir, blockr); + storeBlock(blockAt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -937,7 +960,7 @@ struct dhs_cpack cblock; PacketBlock blockr, blocki; - bload(cblock, rhs, i, j); + bload(cblock, rhs, i, j); blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL64); blockr.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, p16uc_GETREAL64); @@ -951,8 +974,8 @@ struct dhs_cpack(blockBt + rir, blockr); - storeBlock(blockBt + rii, blocki); + storeBlock(blockBt + rir, blockr); + storeBlock(blockBt + rii, blocki); rir += 2*vectorSize; rii += 2*vectorSize; @@ -961,27 +984,26 @@ struct dhs_cpack -EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV) -{ - if(NegativeAccumulate) - { - 
acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
- acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
- acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
- acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
- } else {
- acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
- acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
- acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
- acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
- }
-}
-
-template
-EIGEN_STRONG_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV)
+template
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock* acc, const Packet& lhsV, const Packet* rhsV)
 {
   if(NegativeAccumulate)
   {
     acc->packet[0] = vec_nmsub(lhsV, rhsV[0], acc->packet[0]);
+    if (N > 1) {
+      acc->packet[1] = vec_nmsub(lhsV, rhsV[1], acc->packet[1]);
+    }
+    if (N > 2) {
+      acc->packet[2] = vec_nmsub(lhsV, rhsV[2], acc->packet[2]);
+    }
+    if (N > 3) {
+      acc->packet[3] = vec_nmsub(lhsV, rhsV[3], acc->packet[3]);
+    }
   } else {
     acc->packet[0] = vec_madd(lhsV, rhsV[0], acc->packet[0]);
+    if (N > 1) {
+      acc->packet[1] = vec_madd(lhsV, rhsV[1], acc->packet[1]);
+    }
+    if (N > 2) {
+      acc->packet[2] = vec_madd(lhsV, rhsV[2], acc->packet[2]);
+    }
+    if (N > 3) {
+      acc->packet[3] = vec_madd(lhsV, rhsV[3], acc->packet[3]);
+    }
   }
 }

 template
-EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV)
+EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV)
 {
   Packet lhsV = pload(lhs);

-  pger_common(acc, lhsV, rhsV);
+  pger_common(acc, lhsV, rhsV);
 }

-template
-EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, Index remaining_rows)
+template
+EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV)
 {
 #ifdef _ARCH_PWR9
   lhsV = vec_xl_len((Scalar *)lhs, remaining_rows * sizeof(Scalar));
@@ -1040,37 +1063,37 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs, Packet &lhsV, In
 #endif
 }

-template
-EIGEN_STRONG_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV, Index remaining_rows)
+template
+EIGEN_ALWAYS_INLINE void pger(PacketBlock* acc, const Scalar* lhs, const Packet* rhsV)
 {
   Packet lhsV;
-  loadPacketRemaining(lhs, lhsV, remaining_rows);
+  loadPacketRemaining(lhs, lhsV);

-  pger_common(acc, lhsV, rhsV);
+  pger_common(acc, lhsV, rhsV);
 }

// 512-bit rank-1 update of complex acc. It takes decoupled accumulators as entries. It also takes care of mixed types real * complex and complex * real.
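For reference, the "decoupled accumulator" update that the pgerc/pgerc_common family implements is just the complex product split into its real and imaginary parts, with conjugation reduced to sign flips. A minimal scalar sketch (plain doubles instead of VSX packets; rank1_step is an illustrative name, not Eigen's):

```cpp
#include <cassert>
#include <complex>

// One rank-1 update step c += a * b, with the real and imaginary parts kept
// in separate ("decoupled") accumulators, as the packed kernels do.
// Conjugating either operand only flips a sign, which is why the kernel can
// express all four conjugation variants with the same madd/nmsub primitives.
template <bool ConjLhs, bool ConjRhs>
void rank1_step(double lr, double li, double rr, double ri,
                double& accRe, double& accIm) {
  if (ConjLhs) li = -li;
  if (ConjRhs) ri = -ri;
  accRe += lr * rr - li * ri;  // one vec_madd, then one vec_nmsub
  accIm += lr * ri + li * rr;  // two vec_madd
}

int main() {
  double accRe = 0.0, accIm = 0.0;
  rank1_step<false, true>(1.5, -2.0, 0.5, 3.0, accRe, accIm);
  // Check against std::complex performing the same conjugated product.
  std::complex<double> c = std::complex<double>(1.5, -2.0) *
                           std::conj(std::complex<double>(0.5, 3.0));
  assert(accRe == c.real() && accIm == c.imag());
}
```

The LhsIsReal/RhsIsReal specializations in the diff simply drop the terms whose imaginary input is identically zero, rather than computing them and discarding the result.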
template -EIGEN_STRONG_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock* accReal, PacketBlock* accImag, const Packet &lhsV, const Packet &lhsVi, const Packet* rhsV, const Packet* rhsVi) { - pger_common(accReal, lhsV, rhsV); + pger_common(accReal, lhsV, rhsV); if(LhsIsReal) { - pger_common(accImag, lhsV, rhsVi); + pger_common(accImag, lhsV, rhsVi); EIGEN_UNUSED_VARIABLE(lhsVi); } else { if (!RhsIsReal) { - pger_common(accReal, lhsVi, rhsVi); - pger_common(accImag, lhsV, rhsVi); + pger_common(accReal, lhsVi, rhsVi); + pger_common(accImag, lhsV, rhsVi); } else { EIGEN_UNUSED_VARIABLE(rhsVi); } - pger_common(accImag, lhsVi, rhsV); + pger_common(accImag, lhsVi, rhsV); } } template -EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV = ploadLhs(lhs_ptr); Packet lhsVi; @@ -1080,8 +1103,8 @@ EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } -template -EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, Packet &lhsV, Packet &lhsVi) { #ifdef _ARCH_PWR9 lhsV = vec_xl_len((Scalar *)lhs_ptr, remaining_rows * sizeof(Scalar)); @@ -1097,148 +1120,158 @@ EIGEN_STRONG_INLINE void loadPacketRemaining(const Scalar* lhs_ptr, const Scalar #endif } -template -EIGEN_STRONG_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi, Index remaining_rows) +template +EIGEN_ALWAYS_INLINE void pgerc(PacketBlock* accReal, PacketBlock* accImag, const Scalar* lhs_ptr, const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) { Packet lhsV, lhsVi; - loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi, remaining_rows); + loadPacketRemaining(lhs_ptr, lhs_ptr_imag, lhsV, lhsVi); pgerc_common(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi); } template -EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs) +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs) { - return *reinterpret_cast(const_cast(lhs)); + return ploadu(lhs); } // Zero the accumulator on PacketBlock. -template -EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) -{ - acc.packet[0] = pset1((Scalar)0); - acc.packet[1] = pset1((Scalar)0); - acc.packet[2] = pset1((Scalar)0); - acc.packet[3] = pset1((Scalar)0); -} - -template -EIGEN_STRONG_INLINE void bsetzero(PacketBlock& acc) +template +EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock& acc) { acc.packet[0] = pset1((Scalar)0); + if (N > 1) { + acc.packet[1] = pset1((Scalar)0); + } + if (N > 2) { + acc.packet[2] = pset1((Scalar)0); + } + if (N > 3) { + acc.packet[3] = pset1((Scalar)0); + } } // Scale the PacketBlock vectors by alpha. 
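The overload pairs removed here and just below (fixed 4-packet and 1-packet versions of bsetzero, bscale, bscalec_common, and band) all collapse into one template over N whose tail statements are guarded by `if (N > k)`; since N is a compile-time constant, the guards fold away at instantiation. A standalone sketch of the pattern, using C++17 `if constexpr` for the self-contained version (the kernel itself uses a plain `if` on N, which optimizing compilers fold the same way; Block and bscale_sketch are illustrative names):

```cpp
#include <cassert>

// Stand-in for Eigen's PacketBlock: N "packets", here reduced to scalars.
template <typename Packet, int N>
struct Block { Packet packet[N]; };

// acc = pAlpha * accZ + acc, unrolled at compile time for N = 1..4,
// mirroring the N-templated bscale in the diff.
template <typename Packet, int N>
inline void bscale_sketch(Block<Packet, N>& acc, const Block<Packet, N>& accZ,
                          const Packet& pAlpha) {
  acc.packet[0] += pAlpha * accZ.packet[0];
  if constexpr (N > 1) acc.packet[1] += pAlpha * accZ.packet[1];
  if constexpr (N > 2) acc.packet[2] += pAlpha * accZ.packet[2];
  if constexpr (N > 3) acc.packet[3] += pAlpha * accZ.packet[3];
}

int main() {
  Block<float, 2> acc{{1.f, 1.f}}, accZ{{2.f, 4.f}};
  bscale_sketch(acc, accZ, 0.5f);        // only the first two lanes exist
  assert(acc.packet[0] == 2.f && acc.packet[1] == 3.f);
}
```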
-template
-EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
-{
- acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
- acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
- acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
- acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
-}
-
-template
-EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
+template
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmadd(pAlpha, accZ.packet[0], acc.packet[0]);
+  if (N > 1) {
+    acc.packet[1] = pmadd(pAlpha, accZ.packet[1], acc.packet[1]);
+  }
+  if (N > 2) {
+    acc.packet[2] = pmadd(pAlpha, accZ.packet[2], acc.packet[2]);
+  }
+  if (N > 3) {
+    acc.packet[3] = pmadd(pAlpha, accZ.packet[3], acc.packet[3]);
+  }
 }

-template
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
-{
- acc.packet[0] = pmul(accZ.packet[0], pAlpha);
- acc.packet[1] = pmul(accZ.packet[1], pAlpha);
- acc.packet[2] = pmul(accZ.packet[2], pAlpha);
- acc.packet[3] = pmul(accZ.packet[3], pAlpha);
-}
-
-template
-EIGEN_STRONG_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
+template
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha)
 {
   acc.packet[0] = pmul(accZ.packet[0], pAlpha);
+  if (N > 1) {
+    acc.packet[1] = pmul(accZ.packet[1], pAlpha);
+  }
+  if (N > 2) {
+    acc.packet[2] = pmul(accZ.packet[2], pAlpha);
+  }
+  if (N > 3) {
+    acc.packet[3] = pmul(accZ.packet[3], pAlpha);
+  }
 }

 // Complex version of PacketBlock scaling.
 template
-EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag)
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag)
 {
-  bscalec_common(cReal, aReal, bReal);
+  bscalec_common(cReal, aReal, bReal);

-  bscalec_common(cImag, aImag, bReal);
+  bscalec_common(cImag, aImag, bReal);

-  pger_common(&cReal, bImag, aImag.packet);
+  pger_common(&cReal, bImag, aImag.packet);

-  pger_common(&cImag, bImag, aReal.packet);
+  pger_common(&cImag, bImag, aReal.packet);
 }

-template
-EIGEN_STRONG_INLINE void band(PacketBlock& acc, const Packet& pMask)
+template
+EIGEN_ALWAYS_INLINE void band(PacketBlock& acc, const Packet& pMask)
 {
   acc.packet[0] = pand(acc.packet[0], pMask);
-  acc.packet[1] = pand(acc.packet[1], pMask);
-  acc.packet[2] = pand(acc.packet[2], pMask);
-  acc.packet[3] = pand(acc.packet[3], pMask);
+  if (N > 1) {
+    acc.packet[1] = pand(acc.packet[1], pMask);
+  }
+  if (N > 2) {
+    acc.packet[2] = pand(acc.packet[2], pMask);
+  }
+  if (N > 3) {
+    acc.packet[3] = pand(acc.packet[3], pMask);
+  }
 }

-template
-EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask)
+template
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag, const Packet& pMask)
 {
-  band(aReal, pMask);
-  band(aImag, pMask);
+  band(aReal, pMask);
+  band(aImag, pMask);

-  bscalec(aReal, aImag, bReal, bImag, cReal, cImag);
+  bscalec(aReal, aImag, bReal, bImag, cReal, cImag);
 }

// Load a PacketBlock, the N parameters make tuning gemm easier so we can
add more accumulators as needed. -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) -{ - if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - } -} - -// An overload of bload when you have a PacketBLock with 8 vectors. -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col) { if (StorageOrder == RowMajor) { - acc.packet[0] = res.template loadPacket(row + 0, col + N*accCols); - acc.packet[1] = res.template loadPacket(row + 1, col + N*accCols); - acc.packet[2] = res.template loadPacket(row + 2, col + N*accCols); - acc.packet[3] = res.template loadPacket(row + 3, col + N*accCols); - acc.packet[4] = res.template loadPacket(row + 0, col + (N+1)*accCols); - acc.packet[5] = res.template loadPacket(row + 1, col + (N+1)*accCols); - acc.packet[6] = res.template loadPacket(row + 2, col + (N+1)*accCols); - acc.packet[7] = res.template loadPacket(row + 3, col + (N+1)*accCols); + acc.packet[0] = res.template loadPacket(row + 0, col); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row + 1, col); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row + 2, col); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row + 3, col); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + 0, col + accCols); + if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + 1, col + accCols); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + 2, col + accCols); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + 3, col + accCols); + } + } } else { - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + N*accCols, col + 1); - acc.packet[2] = res.template loadPacket(row + N*accCols, col + 2); - acc.packet[3] = res.template loadPacket(row + N*accCols, col + 3); - acc.packet[4] = res.template loadPacket(row + (N+1)*accCols, col + 0); - acc.packet[5] = res.template loadPacket(row + (N+1)*accCols, col + 1); - acc.packet[6] = res.template loadPacket(row + (N+1)*accCols, col + 2); - acc.packet[7] = res.template loadPacket(row + (N+1)*accCols, col + 3); + acc.packet[0] = res.template loadPacket(row, col + 0); + if (N > 1) { + acc.packet[1] = res.template loadPacket(row, col + 1); + } + if (N > 2) { + acc.packet[2] = res.template loadPacket(row, col + 2); + } + if (N > 3) { + acc.packet[3] = res.template loadPacket(row, col + 3); + } + if (Complex) { + acc.packet[0+N] = res.template loadPacket(row + accCols, col + 0); + if (N > 1) { + acc.packet[1+N] = res.template loadPacket(row + accCols, col + 1); + } + if (N > 2) { + acc.packet[2+N] = res.template loadPacket(row + accCols, col + 2); + } + if (N > 3) { + acc.packet[3+N] = res.template loadPacket(row + accCols, col + 3); + } + } } } -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const 
DataMapper& res, Index row, Index col) -{ - acc.packet[0] = res.template loadPacket(row + N*accCols, col + 0); - acc.packet[1] = res.template loadPacket(row + (N+1)*accCols, col + 0); -} - const static Packet4i mask41 = { -1, 0, 0, 0 }; const static Packet4i mask42 = { -1, -1, 0, 0 }; const static Packet4i mask43 = { -1, -1, -1, 0 }; @@ -1246,7 +1279,7 @@ const static Packet4i mask43 = { -1, -1, -1, 0 }; const static Packet2l mask21 = { -1, 0 }; template -EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(float(0.0)); // Not used @@ -1260,7 +1293,7 @@ EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows) } template<> -EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) +EIGEN_ALWAYS_INLINE Packet2d bmask(const int remaining_rows) { if (remaining_rows == 0) { return pset1(double(0.0)); // Not used @@ -1269,22 +1302,44 @@ EIGEN_STRONG_INLINE Packet2d bmask(const int remaining_rows) } } -template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) +template +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha, const Packet& pMask) { - band(accZ, pMask); + band(accZ, pMask); - bscale(acc, accZ, pAlpha); + bscale(acc, accZ, pAlpha); } -template -EIGEN_STRONG_INLINE void pbroadcast4_old(const __UNPACK_TYPE__(Packet)* a, Packet& a0, Packet& a1, Packet& a2, Packet& a3) +template EIGEN_ALWAYS_INLINE void +pbroadcastN_old(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) +{ + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } +} + +template<> +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - pbroadcast4(a, a0, a1, a2, a3); + pbroadcast4(a, a0, a1, a2, a3); } template<> -EIGEN_STRONG_INLINE void pbroadcast4_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) +EIGEN_ALWAYS_INLINE void pbroadcastN_old(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) { a1 = pload(a); a3 = pload(a + 2); @@ -1294,89 +1349,96 @@ EIGEN_STRONG_INLINE void pbroadcast4_old(const double* a, Packet2d& a0 a3 = vec_splat(a3, 1); } -// PEEL loop factor. 
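The mask helpers above exist so the row tail can keep using full-width packet arithmetic: bmask materializes an all-ones/all-zeros lane mask from remaining_rows, and band clears the dead lanes before the alpha scale so a whole packet can still be stored. A scalar sketch of the same idea (four float lanes, as in mask41/mask42/mask43; names are illustrative):

```cpp
#include <cassert>
#include <cstdint>

// Build a 4-lane mask: lanes [0, remaining_rows) all-ones, the rest zero,
// like the mask4x constants selected by bmask in the diff.
inline void bmask_sketch(uint32_t mask[4], int remaining_rows) {
  for (int i = 0; i < 4; ++i)
    mask[i] = (i < remaining_rows) ? 0xFFFFFFFFu : 0u;
}

// band: clear the lanes that fall outside the live remainder
// (a single vec_and with the mask in the packet code).
inline void band_sketch(float acc[4], const uint32_t mask[4]) {
  for (int i = 0; i < 4; ++i)
    if (!mask[i]) acc[i] = 0.0f;
}

int main() {
  uint32_t mask[4];
  bmask_sketch(mask, 3);                    // 3 live rows in the tail
  float acc[4] = {1.f, 2.f, 3.f, 4.f};
  band_sketch(acc, mask);
  assert(acc[2] == 3.f && acc[3] == 0.f);   // lane 3 masked off
}
```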
-#define PEEL 7 - -template -EIGEN_STRONG_INLINE void MICRO_EXTRA_COL( - const Scalar* &lhs_ptr, - const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows, - Index remaining_cols) +template EIGEN_ALWAYS_INLINE void +pbroadcastN(const __UNPACK_TYPE__(Packet) *a, + Packet& a0, Packet& a1, Packet& a2, Packet& a3) { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1,Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + a0 = pset1(a[0]); + if (N > 1) { + a1 = pset1(a[1]); + } else { + EIGEN_UNUSED_VARIABLE(a1); + } + if (N > 2) { + a2 = pset1(a[2]); + } else { + EIGEN_UNUSED_VARIABLE(a2); + } + if (N > 3) { + a3 = pset1(a[3]); + } else { + EIGEN_UNUSED_VARIABLE(a3); + } } -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha) +template<> EIGEN_ALWAYS_INLINE void +pbroadcastN(const float *a, + Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero; + a3 = pload(a); + a0 = vec_splat(a3, 0); + a1 = vec_splat(a3, 1); + a2 = vec_splat(a3, 2); + a3 = vec_splat(a3, 3); +} - bsetzero(accZero); +// PEEL loop factor. +#define PEEL 7 +#define PEEL_ROW 7 - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_EXTRA_COL(lhs_ptr, rhs_ptr, accZero, remaining_rows, remaining_cols); +#define MICRO_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_ZERO_PEEL(peel) \ + if ((PEEL_ROW > peel) && (peel != 0)) { \ + bsetzero(accZero##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accZero##peel); \ } - for(; k < depth; k++) - { - Packet rhsV[1]; - rhsV[0] = pset1(rhs_ptr[0]); - pger<1, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); - lhs_ptr += remaining_rows; - rhs_ptr += remaining_cols; + +#define MICRO_ZERO_PEEL_ROW \ + MICRO_UNROLL_PEEL(MICRO_ZERO_PEEL); + +#define MICRO_WORK_PEEL(peel) \ + if (PEEL_ROW > peel) { \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + pger(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } - accZero.packet[0] = vec_mul(pAlpha, accZero.packet[0]); - for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col) += accZero.packet[0][i]; +#define MICRO_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \ + MICRO_UNROLL_PEEL(MICRO_WORK_PEEL); \ + lhs_ptr += (remaining_rows * PEEL_ROW); \ + rhs_ptr += (accRows * PEEL_ROW); + +#define MICRO_ADD_PEEL(peel, sum) \ + if (PEEL_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accZero##sum.packet[i] += accZero##peel.packet[i]; \ + } \ } -} -template -EIGEN_STRONG_INLINE void MICRO_EXTRA_ROW( +#define MICRO_ADD_PEEL_ROW \ + MICRO_ADD_PEEL(4, 0) MICRO_ADD_PEEL(5, 1) MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) \ + MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0) + 
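The MICRO_ZERO_PEEL / MICRO_WORK_PEEL / MICRO_ADD_PEEL trio gives each peeled depth step its own accumulator, so the fused multiply-adds within a PEEL_ROW batch stay independent instead of serializing on a single register; MICRO_ADD_PEEL_ROW then folds the partials pairwise (4 into 0, 5 into 1, 6 into 2, 7 into 3, then 2 into 0, 3 into 1, then 1 into 0). A scalar sketch of that reduction order, assuming all eight accumulators are live:

```cpp
#include <cassert>

// Fold 8 per-peel partial sums the way MICRO_ADD_PEEL_ROW does: pairwise,
// so the additions themselves remain independent rather than forming a chain.
double add_peel_sketch(double acc[8]) {
  acc[0] += acc[4]; acc[1] += acc[5]; acc[2] += acc[6]; acc[3] += acc[7];
  acc[0] += acc[2]; acc[1] += acc[3];
  acc[0] += acc[1];
  return acc[0];
}

int main() {
  double acc[8] = {1, 2, 3, 4, 5, 6, 7, 8};
  assert(add_peel_sketch(acc) == 36.0);  // same total as a serial sum
}
```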
+template +EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW( const Scalar* &lhs_ptr, const Scalar* &rhs_ptr, - PacketBlock &accZero, - Index remaining_rows) + PacketBlock &accZero) { Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV); + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } -template -EIGEN_STRONG_INLINE void gemm_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1387,105 +1449,125 @@ EIGEN_STRONG_INLINE void gemm_extra_row( Index col, Index rows, Index cols, - Index remaining_rows, const Packet& pAlpha, const Packet& pMask) { const Scalar* rhs_ptr = rhs_base; const Scalar* lhs_ptr = lhs_base + row*strideA + remaining_rows*offsetA; - PacketBlock accZero, acc; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc; - bsetzero(accZero); + bsetzero(accZero0); - Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL <= remaining_depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - EIGEN_POWER_PREFETCH(lhs_ptr); - for (int l = 0; l < PEEL; l++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); - } + if (remaining_depth >= PEEL_ROW) { + MICRO_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr); + EIGEN_POWER_PREFETCH(lhs_ptr); + MICRO_WORK_PEEL_ROW + } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth); + MICRO_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero, remaining_rows); + MICRO_EXTRA_ROW(lhs_ptr, rhs_ptr, accZero0); } if ((remaining_depth == depth) && (rows >= accCols)) { - for(Index j = 0; j < 4; j++) { - acc.packet[j] = res.template loadPacket(row, col + j); - } - bscale(acc, accZero, pAlpha, pMask); - res.template storePacketBlock(row, col, acc); + bload(acc, res, row, 0); + bscale(acc, accZero0, pAlpha, pMask); + res.template storePacketBlock(row, 0, acc); } else { for(; k < depth; k++) { Packet rhsV[4]; - pbroadcast4(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - pger<4, Scalar, Packet, Index, false>(&accZero, lhs_ptr, rhsV, remaining_rows); + pbroadcastN(rhs_ptr, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + pger(&accZero0, lhs_ptr, rhsV); lhs_ptr += remaining_rows; rhs_ptr += accRows; } - for(Index j = 0; j < 4; j++) { - accZero.packet[j] = vec_mul(pAlpha, accZero.packet[j]); - } - for(Index j = 0; j < 4; j++) { + for(Index j = 0; j < accRows; j++) { + accZero0.packet[j] = vec_mul(pAlpha, accZero0.packet[j]); for(Index i = 0; i < remaining_rows; i++) { - res(row + i, col + j) += accZero.packet[j][i]; + res(row + i, j) += accZero0.packet[j][i]; } } } } -#define MICRO_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) - -#define MICRO_UNROLL_WORK(func, func2, peel) \ - MICRO_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ - func(4,peel) func(5,peel) func(6,peel) func(7,peel) - -#define MICRO_LOAD_ONE(iter) \ - if (unroll_factor > iter) { \ - lhsV##iter = ploadLhs(lhs_ptr##iter); \ - lhs_ptr##iter += accCols; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhsV##iter); \ - } - -#define MICRO_WORK_ONE(iter, peel) \ - if (unroll_factor > iter) { \ - pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ +template 
+EIGEN_ALWAYS_INLINE void gemm_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + switch(remaining_rows) { + case 1: + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, pAlpha, pMask); + } + break; } +} -#define MICRO_TYPE_PEEL4(func, func2, peel) \ - if (PEEL > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - pbroadcast4(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ - MICRO_UNROLL_WORK(func, func2, peel) \ +#define MICRO_UNROLL(func) \ + func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7) + +#define MICRO_UNROLL_WORK(func, func2, peel) \ + MICRO_UNROLL(func2); \ + func(0,peel) func(1,peel) func(2,peel) func(3,peel) \ + func(4,peel) func(5,peel) func(6,peel) func(7,peel) + +#define MICRO_LOAD_ONE(iter) \ + if (unroll_factor > iter) { \ + lhsV##iter = ploadLhs(lhs_ptr##iter); \ + lhs_ptr##iter += accCols; \ } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(lhsV##iter); \ } -#define MICRO_TYPE_PEEL1(func, func2, peel) \ +#define MICRO_WORK_ONE(iter, peel) \ + if (unroll_factor > iter) { \ + pger_common(&accZero##iter, lhsV##iter, rhsV##peel); \ + } + +#define MICRO_TYPE_PEEL4(func, func2, peel) \ if (PEEL > peel) { \ Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \ - rhsV##peel[0] = pset1(rhs_ptr[remaining_cols * peel]); \ + pbroadcastN(rhs_ptr + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ MICRO_UNROLL_WORK(func, func2, peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ } #define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M]; \ func(func1,func2,0); func(func1,func2,1); \ func(func1,func2,2); func(func1,func2,3); \ func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,6); func(func1,func2,7); #define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M]; \ @@ -1499,17 +1581,9 @@ EIGEN_STRONG_INLINE void gemm_extra_row( MICRO_UNROLL_TYPE_ONE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ rhs_ptr += accRows; -#define MICRO_ONE_PEEL1 \ - MICRO_UNROLL_TYPE_PEEL(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += (remaining_cols * PEEL); - -#define MICRO_ONE1 \ - MICRO_UNROLL_TYPE_ONE(1, MICRO_TYPE_PEEL1, MICRO_WORK_ONE, MICRO_LOAD_ONE); \ - rhs_ptr += remaining_cols; - #define MICRO_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accZero##iter); \ + bsetzero(accZero##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accZero##iter); \ } @@ -1518,7 +1592,7 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = 
lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -1534,25 +1608,13 @@ EIGEN_STRONG_INLINE void gemm_extra_row( #define MICRO_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - acc.packet[1] = res.template loadPacket(row + iter*accCols, col + 1); \ - acc.packet[2] = res.template loadPacket(row + iter*accCols, col + 2); \ - acc.packet[3] = res.template loadPacket(row + iter*accCols, col + 3); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ + bload(acc, res, row + iter*accCols, 0); \ + bscale(acc, accZero##iter, pAlpha); \ + res.template storePacketBlock(row + iter*accCols, 0, acc); \ } #define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE) -#define MICRO_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - acc.packet[0] = res.template loadPacket(row + iter*accCols, col + 0); \ - bscale(acc, accZero##iter, pAlpha); \ - res.template storePacketBlock(row + iter*accCols, col, acc); \ - } - -#define MICRO_COL_STORE MICRO_UNROLL(MICRO_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const DataMapper& res, @@ -1560,16 +1622,13 @@ EIGEN_STRONG_INLINE void gemm_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { -asm("#gemm begin"); const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; + PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; + PacketBlock acc; MICRO_SRC_PTR MICRO_DST_PTR @@ -1588,104 +1647,102 @@ asm("#gemm begin"); MICRO_STORE row += unroll_factor*accCols; -asm("#gemm end"); } -template -EIGEN_STRONG_INLINE void gemm_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, - Index& row, + Index strideB, + Index offsetB, Index col, - Index remaining_cols, - const Packet& pAlpha) + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { - const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, *lhs_ptr7; - PacketBlock accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; - PacketBlock acc; - - MICRO_SRC_PTR - MICRO_DST_PTR + const DataMapper res3 = res.getSubMapper(0, col); - Index k = 0; - for(; k + PEEL <= depth; k+= PEEL) - { - EIGEN_POWER_PREFETCH(rhs_ptr); - MICRO_PREFETCH - MICRO_ONE_PEEL1 - } - for(; k < depth; k++) - { - MICRO_ONE1 - } - MICRO_COL_STORE + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - row += unroll_factor*accCols; -} - -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - 
Index strideA, - Index offsetA, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha) -{ #define MAX_UNROLL 6 while(row + MAX_UNROLL*accCols <= rows) { - gemm_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); } switch( (rows-row)/accCols ) { #if MAX_UNROLL > 7 case 7: - gemm_unrolled_col_iteration<7, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 6 case 6: - gemm_unrolled_col_iteration<6, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 5 - case 5: - gemm_unrolled_col_iteration<5, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 5: + gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 4 - case 4: - gemm_unrolled_col_iteration<4, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); + case 4: + gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); break; #endif #if MAX_UNROLL > 3 - case 3: - gemm_unrolled_col_iteration<3, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 3: + gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 2 - case 2: - gemm_unrolled_col_iteration<2, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 2: + gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_UNROLL > 1 - case 1: - gemm_unrolled_col_iteration<1, Scalar, Packet, DataMapper, Index, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_cols, pAlpha); - break; + case 1: + gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; + default: + break; } #undef MAX_UNROLL + + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } +} + +template +EIGEN_STRONG_INLINE void gemm_extra_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) +{ + for (; col < cols; col++) { + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, 
rows, cols, remaining_rows, pAlpha, pMask); + } } /**************** @@ -1695,7 +1752,6 @@ template(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { -#if MAX_UNROLL > 7 - case 7: - gemm_unrolled_iteration<7, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 6 - case 6: - gemm_unrolled_iteration<6, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 5 - case 5: - gemm_unrolled_iteration<5, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 4 - case 4: - gemm_unrolled_iteration<4, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 3 - case 3: - gemm_unrolled_iteration<3, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 2 - case 2: - gemm_unrolled_iteration<2, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif -#if MAX_UNROLL > 1 - case 1: - gemm_unrolled_iteration<1, Scalar, Packet, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; -#endif - default: - break; - } -#undef MAX_UNROLL - - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } - - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); - - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; + gemm_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } - } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -1787,117 +1774,66 @@ EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const // PEEL_COMPLEX loop factor. 
#define PEEL_COMPLEX 3 +#define PEEL_COMPLEX_ROW 3 -template -EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_COL( - const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, - const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows, - Index remaining_cols) -{ - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); -} - -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index strideB, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag) -{ - const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) rhs_ptr_imag = rhs_base + remaining_cols*strideB; - else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - const Scalar* lhs_ptr_real = lhs_base + advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; - if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; - else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - - bsetzero(accReal); - bsetzero(accImag); +#define MICRO_COMPLEX_UNROLL_PEEL(func) \ + func(0) func(1) func(2) func(3) - Index remaining_depth = (depth & -accRows); - Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); - } - } - for(; k < remaining_depth; k++) - { - MICRO_COMPLEX_EXTRA_COL(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows, remaining_cols); +#define MICRO_COMPLEX_ZERO_PEEL(peel) \ + if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \ + bsetzero(accReal##peel); \ + bsetzero(accImag##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(accReal##peel); \ + EIGEN_UNUSED_VARIABLE(accImag##peel); \ } - for(; k < depth; k++) - { - Packet rhsV[1], rhsVi[1]; - rhsV[0] = pset1(rhs_ptr_real[0]); - if(!RhsIsReal) rhsVi[0] = pset1(rhs_ptr_imag[0]); - pgerc<1, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); - lhs_ptr_real += remaining_rows; - if(!LhsIsReal) lhs_ptr_imag += remaining_rows; - rhs_ptr_real += remaining_cols; - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; +#define MICRO_COMPLEX_ZERO_PEEL_ROW \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_ZERO_PEEL); + +#define MICRO_COMPLEX_WORK_PEEL(peel) \ + if (PEEL_COMPLEX_ROW > peel) { \ + pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag + 
(accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ + pgerc(&accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel), lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel); \ + } else { \ + EIGEN_UNUSED_VARIABLE(rhsV##peel); \ + EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); +#define MICRO_COMPLEX_WORK_PEEL_ROW \ + Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4]; \ + Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \ + MICRO_COMPLEX_UNROLL_PEEL(MICRO_COMPLEX_WORK_PEEL); \ + lhs_ptr_real += (remaining_rows * PEEL_COMPLEX_ROW); \ + if(!LhsIsReal) lhs_ptr_imag += (remaining_rows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); \ + rhs_ptr_real += (accRows * PEEL_COMPLEX_ROW); \ + if(!RhsIsReal) rhs_ptr_imag += (accRows * PEEL_COMPLEX_ROW); \ + else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) - { - res(row + 0, col + 0) += pfirst(acc0.packet[0]); - } else { - acc0.packet[0] += res.template loadPacket(row + 0, col + 0); - res.template storePacketBlock(row + 0, col + 0, acc0); - if(remaining_rows > accColsC) { - res(row + accColsC, col + 0) += pfirst(acc1.packet[0]); - } +#define MICRO_COMPLEX_ADD_PEEL(peel, sum) \ + if (PEEL_COMPLEX_ROW > peel) { \ + for (Index i = 0; i < accRows; i++) { \ + accReal##sum.packet[i] += accReal##peel.packet[i]; \ + accImag##sum.packet[i] += accImag##peel.packet[i]; \ + } \ } -} -template -EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW( +#define MICRO_COMPLEX_ADD_PEEL_ROW \ + MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) \ + MICRO_COMPLEX_ADD_PEEL(1, 0) + +template +EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW( const Scalar* &lhs_ptr_real, const Scalar* &lhs_ptr_imag, const Scalar* &rhs_ptr_real, const Scalar* &rhs_ptr_imag, - PacketBlock &accReal, PacketBlock &accImag, - Index remaining_rows) + PacketBlock &accReal, PacketBlock &accImag) { Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); @@ -1906,8 +1842,8 @@ EIGEN_STRONG_INLINE void MICRO_COMPLEX_EXTRA_ROW( else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } -template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +template +EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -1919,108 +1855,141 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( Index col, Index rows, Index cols, - Index remaining_rows, const Packet& pAlphaReal, const Packet& pAlphaImag, const Packet& pMask) { -asm("#gemm_complex begin"); const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; if(!RhsIsReal) rhs_ptr_imag = rhs_base + accRows*strideB; else EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); const Scalar* lhs_ptr_real = lhs_base + 
advanceRows*row*strideA + remaining_rows*offsetA; - const Scalar* lhs_ptr_imag; + const Scalar* lhs_ptr_imag = NULL; if(!LhsIsReal) lhs_ptr_imag = lhs_ptr_real + remaining_rows*strideA; else EIGEN_UNUSED_VARIABLE(lhs_ptr_imag); - PacketBlock accReal, accImag; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + PacketBlock accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; - bsetzero(accReal); - bsetzero(accImag); + bsetzero(accReal0); + bsetzero(accImag0); - Index remaining_depth = (col + accRows < cols) ? depth : (depth & -accRows); + Index remaining_depth = (col + quad_traits::rows < cols) ? depth : (depth & -quad_traits::rows); Index k = 0; - for(; k + PEEL_COMPLEX <= remaining_depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - EIGEN_POWER_PREFETCH(lhs_ptr_real); - if(!LhsIsReal) { - EIGEN_POWER_PREFETCH(lhs_ptr_imag); - } - for (int l = 0; l < PEEL_COMPLEX; l++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); - } + if (remaining_depth >= PEEL_COMPLEX_ROW) { + MICRO_COMPLEX_ZERO_PEEL_ROW + do + { + EIGEN_POWER_PREFETCH(rhs_ptr_real); + if(!RhsIsReal) { + EIGEN_POWER_PREFETCH(rhs_ptr_imag); + } + EIGEN_POWER_PREFETCH(lhs_ptr_real); + if(!LhsIsReal) { + EIGEN_POWER_PREFETCH(lhs_ptr_imag); + } + MICRO_COMPLEX_WORK_PEEL_ROW + } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth); + MICRO_COMPLEX_ADD_PEEL_ROW } for(; k < remaining_depth; k++) { - MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal, accImag, remaining_rows); + MICRO_COMPLEX_EXTRA_ROW(lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real, rhs_ptr_imag, accReal0, accImag0); } if ((remaining_depth == depth) && (rows >= accCols)) { - bload(tRes, res, row, col); - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); - bcouple(taccReal, taccImag, tRes, acc0, acc1); - res.template storePacketBlock(row + 0, col, acc0); - res.template storePacketBlock(row + accColsC, col, acc1); + bload(tRes, res, row, 0); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask); + bcouple(taccReal, taccImag, tRes, acc0, acc1); + res.template storePacketBlock(row + 0, 0, acc0); + res.template storePacketBlock(row + accColsC, 0, acc1); } else { for(; k < depth; k++) { Packet rhsV[4], rhsVi[4]; - pbroadcast4_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); - if(!RhsIsReal) pbroadcast4_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); - pgerc<4, Scalar, Packet, Index, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi, remaining_rows); + pbroadcastN_old(rhs_ptr_real, rhsV[0], rhsV[1], rhsV[2], rhsV[3]); + if(!RhsIsReal) pbroadcastN_old(rhs_ptr_imag, rhsVi[0], rhsVi[1], rhsVi[2], rhsVi[3]); + pgerc(&accReal0, &accImag0, lhs_ptr_real, lhs_ptr_imag, rhsV, rhsVi); lhs_ptr_real += remaining_rows; if(!LhsIsReal) lhs_ptr_imag += remaining_rows; rhs_ptr_real += accRows; if(!RhsIsReal) rhs_ptr_imag += accRows; } - bscalec(accReal, accImag, pAlphaReal, pAlphaImag, taccReal, taccImag); - bcouple_common(taccReal, taccImag, acc0, acc1); + bscalec(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag); + bcouple_common(taccReal, taccImag, acc0, acc1); if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) 
{ - for(Index j = 0; j < 4; j++) { - res(row + 0, col + j) += pfirst(acc0.packet[j]); + for(Index j = 0; j < accRows; j++) { + res(row + 0, j) += pfirst(acc0.packet[j]); } } else { - for(Index j = 0; j < 4; j++) { + for(Index j = 0; j < accRows; j++) { PacketBlock acc2; - acc2.packet[0] = res.template loadPacket(row + 0, col + j) + acc0.packet[j]; - res.template storePacketBlock(row + 0, col + j, acc2); + acc2.packet[0] = res.template loadPacket(row + 0, j) + acc0.packet[j]; + res.template storePacketBlock(row + 0, j, acc2); if(remaining_rows > accColsC) { - res(row + accColsC, col + j) += pfirst(acc1.packet[j]); + res(row + accColsC, j) += pfirst(acc1.packet[j]); } } } } -asm("#gemm_complex end"); +} + +template +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( + const DataMapper& res, + const Scalar* lhs_base, + const Scalar* rhs_base, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index row, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + switch(remaining_rows) { + case 1: + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + break; + case 2: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + } + break; + default: + if (sizeof(Scalar) == sizeof(float)) { + gemm_unrolled_complex_row_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, pAlphaReal, pAlphaImag, pMask); + } + break; + } } #define MICRO_COMPLEX_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ MICRO_COMPLEX_UNROLL(func2); \ - func(0,peel) func(1,peel) func(2,peel) func(3,peel) func(4,peel) + func(0,peel) func(1,peel) func(2,peel) func(3,peel) #define MICRO_COMPLEX_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -2028,37 +1997,16 @@ asm("#gemm_complex end"); #define MICRO_COMPLEX_WORK_ONE4(iter, peel) \ if (unroll_factor > iter) { \ - pgerc_common<4, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ - } - -#define MICRO_COMPLEX_WORK_ONE1(iter, peel) \ - if (unroll_factor > iter) { \ - pgerc_common<1, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ + pgerc_common(&accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \ } #define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \ if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - pbroadcast4_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ + 
pbroadcastN_old(rhs_ptr_real + (accRows * peel), rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]); \ if(!RhsIsReal) { \ - pbroadcast4_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } \ - MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \ - } else { \ - EIGEN_UNUSED_VARIABLE(rhsV##peel); \ - EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ - } - -#define MICRO_COMPLEX_TYPE_PEEL1(func, func2, peel) \ - if (PEEL_COMPLEX > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ - rhsV##peel[0] = pset1(rhs_ptr_real[remaining_cols * peel]); \ - if(!RhsIsReal) { \ - rhsVi##peel[0] = pset1(rhs_ptr_imag[remaining_cols * peel]); \ + pbroadcastN_old(rhs_ptr_imag + (accRows * peel), rhsVi##peel[0], rhsVi##peel[1], rhsVi##peel[2], rhsVi##peel[3]); \ } else { \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ @@ -2069,13 +2017,10 @@ asm("#gemm_complex end"); } #define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \ - Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M], rhsV8[M], rhsV9[M]; \ - Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M], rhsVi4[M], rhsVi5[M], rhsVi6[M], rhsVi7[M], rhsVi8[M], rhsVi9[M]; \ + Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M]; \ + Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M]; \ func(func1,func2,0); func(func1,func2,1); \ - func(func1,func2,2); func(func1,func2,3); \ - func(func1,func2,4); func(func1,func2,5); \ - func(func1,func2,6); func(func1,func2,7); \ - func(func1,func2,8); func(func1,func2,9); + func(func1,func2,2); func(func1,func2,3); #define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \ Packet rhsV0[M], rhsVi0[M];\ @@ -2091,20 +2036,10 @@ asm("#gemm_complex end"); rhs_ptr_real += accRows; \ if(!RhsIsReal) rhs_ptr_imag += accRows; -#define MICRO_COMPLEX_ONE_PEEL1 \ - MICRO_COMPLEX_UNROLL_TYPE_PEEL(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += (remaining_cols * PEEL_COMPLEX); \ - if(!RhsIsReal) rhs_ptr_imag += (remaining_cols * PEEL_COMPLEX); - -#define MICRO_COMPLEX_ONE1 \ - MICRO_COMPLEX_UNROLL_TYPE_ONE(1, MICRO_COMPLEX_TYPE_PEEL1, MICRO_COMPLEX_WORK_ONE1, MICRO_COMPLEX_LOAD_ONE); \ - rhs_ptr_real += remaining_cols; \ - if(!RhsIsReal) rhs_ptr_imag += remaining_cols; - #define MICRO_COMPLEX_DST_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - bsetzero(accReal##iter); \ - bsetzero(accImag##iter); \ + bsetzero(accReal##iter); \ + bsetzero(accImag##iter); \ } else { \ EIGEN_UNUSED_VARIABLE(accReal##iter); \ EIGEN_UNUSED_VARIABLE(accImag##iter); \ @@ -2114,15 +2049,9 @@ asm("#gemm_complex end"); #define MICRO_COMPLEX_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE) @@ -2130,35 +2059,21 @@ asm("#gemm_complex end"); #define MICRO_COMPLEX_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - 
EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE) #define MICRO_COMPLEX_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ + bload(tRes, res, row + iter*accCols, 0); \ + bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ + bcouple(taccReal, taccImag, tRes, acc0, acc1); \ + res.template storePacketBlock(row + iter*accCols + 0, 0, acc0); \ + res.template storePacketBlock(row + iter*accCols + accColsC, 0, acc1); \ } #define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE) -#define MICRO_COMPLEX_COL_STORE_ONE(iter) \ - if (unroll_factor > iter) { \ - bload(tRes, res, row + iter*accCols, col); \ - bscalec(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, taccImag); \ - bcouple(taccReal, taccImag, tRes, acc0, acc1); \ - res.template storePacketBlock(row + iter*accCols + 0, col, acc0); \ - res.template storePacketBlock(row + iter*accCols + accColsC, col, acc1); \ - } - -#define MICRO_COMPLEX_COL_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_COL_STORE_ONE) - template EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const DataMapper& res, @@ -2166,30 +2081,26 @@ EIGEN_STRONG_INLINE void gemm_complex_unrolled_iteration( const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { -asm("#gemm_complex_unrolled begin"); const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + PacketBlock accReal0, accImag0, accReal1, accImag1; + PacketBlock accReal2, accImag2, accReal3, accImag3; + PacketBlock taccReal, taccImag; + PacketBlock acc0, acc1; + PacketBlock tRes; MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_DST_PTR @@ -2211,115 +2122,95 @@ asm("#gemm_complex_unrolled begin"); MICRO_COMPLEX_STORE row += unroll_factor*accCols; -asm("#gemm_complex_unrolled end"); } -template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col_iteration( +template +EIGEN_ALWAYS_INLINE void gemm_complex_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { - const 
Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; - if(!RhsIsReal) { - rhs_ptr_imag = rhs_base + remaining_cols*strideB; - } else { - EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); - } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; - PacketBlock accReal0, accImag0, accReal1, accImag1; - PacketBlock accReal2, accImag2, accReal3, accImag3; - PacketBlock accReal4, accImag4; - PacketBlock taccReal, taccImag; - PacketBlock acc0, acc1; - PacketBlock tRes; + const DataMapper res3 = res.getSubMapper(0, col); - MICRO_COMPLEX_SRC_PTR - MICRO_COMPLEX_DST_PTR + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - Index k = 0; - for(; k + PEEL_COMPLEX <= depth; k+= PEEL_COMPLEX) - { - EIGEN_POWER_PREFETCH(rhs_ptr_real); - if(!RhsIsReal) { - EIGEN_POWER_PREFETCH(rhs_ptr_imag); - } - MICRO_COMPLEX_PREFETCH - MICRO_COMPLEX_ONE_PEEL1 +#define MAX_COMPLEX_UNROLL 3 + while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { + gemm_complex_unrolled_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); } - for(; k < depth; k++) - { - MICRO_COMPLEX_ONE1 + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_UNROLL > 4 + case 4: + gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 3 + case 3: + gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 2 + case 2: + gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_UNROLL > 1 + case 1: + gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; } - MICRO_COMPLEX_COL_STORE +#undef MAX_COMPLEX_UNROLL - row += unroll_factor*accCols; + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag) + const Packet& pAlphaImag, + const Packet& pMask) { -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_col_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, 
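// ---------------------------------------------------------------------------
// The while-then-switch shape used by gemm_complex_cols above, reduced to a
// standalone sketch (assumed names; the panel width is fixed to 1 here for
// clarity, where Eigen uses accCols-wide row panels). The hot loop always
// runs the kernel with the maximum unroll factor; the switch then dispatches
// the last 1..MAX_UNROLL-1 panels to a kernel instantiated with that exact
// count, so unroll_factor is a compile-time constant in every instantiation.
template <int unroll_factor>
void row_panel_kernel_sketch(long& row) { row += unroll_factor; }

inline void dispatch_row_panels_sketch(long rows)
{
  const int MAX_UNROLL = 3;
  long row = 0;
  while (row + MAX_UNROLL <= rows) {
    row_panel_kernel_sketch<MAX_UNROLL>(row);
  }
  switch (rows - row) {
    case 2: row_panel_kernel_sketch<2>(row); break;
    case 1: row_panel_kernel_sketch<1>(row); break;
    default: break;  // 0 panels left
  }
}
// ---------------------------------------------------------------------------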
pAlphaReal, pAlphaImag); + for (; col < cols; col++) { + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_col_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_col_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_col_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_col_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_cols, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL } template EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -2334,64 +2225,10 @@ EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* bl Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_UNROLL 3 - while(row + MAX_COMPLEX_UNROLL*accCols <= rows) { - gemm_complex_unrolled_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_UNROLL > 4 - case 4: - gemm_complex_unrolled_iteration<4, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 3 - case 3: - gemm_complex_unrolled_iteration<3, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 2 - case 2: - gemm_complex_unrolled_iteration<2, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_UNROLL > 1 - case 1: - gemm_complex_unrolled_iteration<1, Scalar, Packet, Packetc, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, 
RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemm_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC @@ -2429,6 +2266,7 @@ void gemm_pack_lhs struct gemm_pack_rhs { @@ -2456,6 +2294,7 @@ void gemm_pack_rhs pack; pack(blockB, rhs, depth, cols, stride, offset); } +#endif template struct gemm_pack_lhs @@ -2484,6 +2323,7 @@ void gemm_pack_lhs pack; pack(blockA, lhs, depth, rows, stride, offset); } + template struct gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> { @@ -2512,6 +2352,7 @@ void gemm_pack_lhs, Index, DataMapper, Pack1, Pack2, Packet, pack(blockA, lhs, depth, rows, stride, offset); } +#if EIGEN_ALTIVEC_USE_CUSTOM_PACK template struct gemm_pack_rhs { @@ -2539,6 +2380,7 @@ void gemm_pack_rhs dhs_pack pack; pack(blockB, rhs, depth, cols, stride, offset); } +#endif template struct gemm_pack_rhs, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> @@ -2646,10 +2488,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2659,7 +2501,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2685,20 +2527,20 @@ void gebp_kernel, std::complex, Index, DataMapper, mr void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, 
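// ---------------------------------------------------------------------------
// The simplified driver structure after this refactor: the old remaining_cols
// kernels are gone, and leftover columns reuse the same per-column-block
// helper. A sketch of the tiling arithmetic only (illustrative names):
inline long column_tiling_sketch(long cols, long accRows)
{
  long col = 0;
  for (; col + accRows <= cols; col += accRows) {
    // full accRows-wide block: handled by gemm_complex_cols(..., col, ...)
  }
  return cols - col;  // == cols % accRows; gemm_complex_extra_cols walks these
}
// ---------------------------------------------------------------------------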
std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2723,20 +2565,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const float*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, float, 
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2761,20 +2603,20 @@ void gebp_kernel, float, Index, DataMapper, mr, nr, Conjugat const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const float*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, float, std::complex, float, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2798,10 +2640,10 @@ void gebp_kernel::size; void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY + #if defined(EIGEN_ALTIVEC_MMA_ONLY) //generate with MMA only gemm_function = &Eigen::internal::gemmMMA; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) if (__builtin_cpu_supports ("arch_3_1") && 
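// ---------------------------------------------------------------------------
// Every gebp_kernel specialization in this hunk repeats the same selection
// idiom; it is condensed here with placeholder kernels (the real code points
// at gemm/gemmMMA instantiations). Under EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH
// both kernels are compiled into the binary, and GCC's
// __builtin_cpu_supports picks the MMA path on Power10 at run time.
inline void generic_kernel_sketch() {}  // stand-in for a gemm<...> instance
inline void mma_kernel_sketch() {}      // stand-in for a gemmMMA<...> instance

typedef void (*kernel_ptr_sketch)();
inline kernel_ptr_sketch select_kernel_sketch()
{
#if defined(EIGEN_ALTIVEC_MMA_ONLY)
  return &mma_kernel_sketch;              // MMA-only build
#elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
  if (__builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma"))
    return &mma_kernel_sketch;            // ISA 3.1 + MMA detected at run time
  return &generic_kernel_sketch;
#else
  return &generic_kernel_sketch;          // baseline VSX build
#endif
}
// ---------------------------------------------------------------------------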
__builtin_cpu_supports ("mma")){ gemm_function = &Eigen::internal::gemmMMA; } @@ -2811,7 +2653,7 @@ void gebp_kernel; #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2836,20 +2678,20 @@ void gebp_kernel, std::complex, Index, DataMapper, const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2874,20 +2716,20 @@ void gebp_kernel, double, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const std::complex*, const double*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = 
&Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, double, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, false, true>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } template @@ -2912,20 +2754,20 @@ void gebp_kernel, Index, DataMapper, mr, nr, Conjug const Index accCols = quad_traits::size; void (*gemm_function)(const DataMapper&, const double*, const std::complex*, Index, Index, Index, std::complex, Index, Index, Index, Index); - #ifdef EIGEN_ALTIVEC_MMA_ONLY - //generate with MMA only - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #elif defined(ALTIVEC_MMA_SUPPORT) && !defined(EIGEN_ALTIVEC_DISABLE_MMA) - if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ - gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - else{ - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - } - #else - gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; - #endif - gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); + #if defined(EIGEN_ALTIVEC_MMA_ONLY) + //generate with MMA only + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) + if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")){ + gemm_function = &Eigen::internal::gemm_complexMMA, std::complex, double, 
Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + else{ + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + } + #else + gemm_function = &Eigen::internal::gemm_complex, std::complex, double, Index, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, true, false>; + #endif + gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h index 6e74116b9e97e1fab00bc261dd676813f50f4dcc..bf01dba1cadd1d6be00108634fd5478c15eecd6b 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h @@ -9,22 +9,8 @@ namespace Eigen { namespace internal { -template -EIGEN_STRONG_INLINE void gemm_extra_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index row, - Index col, - Index remaining_rows, - Index remaining_cols, - const Packet& pAlpha); - template -EIGEN_STRONG_INLINE void gemm_extra_row( +EIGEN_ALWAYS_INLINE void gemm_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -39,41 +25,28 @@ EIGEN_STRONG_INLINE void gemm_extra_row( const Packet& pAlpha, const Packet& pMask); -template -EIGEN_STRONG_INLINE void gemm_unrolled_col( - const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, - Index depth, - Index strideA, - Index offsetA, - Index& row, - Index rows, - Index col, - Index remaining_cols, - const Packet& pAlpha); - -template -EIGEN_STRONG_INLINE Packet bmask(const int remaining_rows); - -template -EIGEN_STRONG_INLINE void gemm_complex_extra_col( +template +EIGEN_STRONG_INLINE void gemm_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index row, + Index offsetB, Index col, + Index rows, + Index cols, Index remaining_rows, - Index remaining_cols, - const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlpha, + const Packet& pMask); + +template +EIGEN_ALWAYS_INLINE Packet bmask(const int remaining_rows); template -EIGEN_STRONG_INLINE void gemm_complex_extra_row( +EIGEN_ALWAYS_INLINE void gemm_complex_extra_row( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, @@ -91,130 +64,95 @@ EIGEN_STRONG_INLINE void gemm_complex_extra_row( const Packet& pMask); template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_col( +EIGEN_STRONG_INLINE void gemm_complex_extra_cols( const DataMapper& res, - const Scalar* lhs_base, - const Scalar* rhs_base, + const Scalar* blockA, + const Scalar* blockB, Index depth, Index strideA, Index offsetA, Index strideB, - Index& row, - Index rows, + Index offsetB, Index col, - Index remaining_cols, + Index rows, + Index cols, + Index remaining_rows, const Packet& pAlphaReal, - const Packet& pAlphaImag); + const Packet& pAlphaImag, + const Packet& pMask); template -EIGEN_STRONG_INLINE Packet ploadLhs(const Scalar* lhs); +EIGEN_ALWAYS_INLINE Packet ploadLhs(const Scalar* lhs); -template -EIGEN_STRONG_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_STRONG_INLINE 
void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); - -template -EIGEN_STRONG_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +template +EIGEN_ALWAYS_INLINE void bload(PacketBlock& acc, const DataMapper& res, Index row, Index col); template -EIGEN_STRONG_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); - -const static Packet16uc p16uc_SETCOMPLEX32_FIRST = { 0, 1, 2, 3, - 16, 17, 18, 19, - 4, 5, 6, 7, - 20, 21, 22, 23}; - -const static Packet16uc p16uc_SETCOMPLEX32_SECOND = { 8, 9, 10, 11, - 24, 25, 26, 27, - 12, 13, 14, 15, - 28, 29, 30, 31}; -//[a,b],[ai,bi] = [a,ai] - This is equivalent to p16uc_GETREAL64 -const static Packet16uc p16uc_SETCOMPLEX64_FIRST = { 0, 1, 2, 3, 4, 5, 6, 7, - 16, 17, 18, 19, 20, 21, 22, 23}; - -//[a,b],[ai,bi] = [b,bi] - This is equivalent to p16uc_GETIMAG64 -const static Packet16uc p16uc_SETCOMPLEX64_SECOND = { 8, 9, 10, 11, 12, 13, 14, 15, - 24, 25, 26, 27, 28, 29, 30, 31}; +EIGEN_ALWAYS_INLINE void bscale(PacketBlock& acc, PacketBlock& accZ, const Packet& pAlpha); +template +EIGEN_ALWAYS_INLINE void bscalec(PacketBlock& aReal, PacketBlock& aImag, const Packet& bReal, const Packet& bImag, PacketBlock& cReal, PacketBlock& cImag); // Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks. -template -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX32_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX32_SECOND); -} - -template -EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) -{ - bcouple_common(taccReal, taccImag, acc1, acc2); - - acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); - acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); - acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); - - acc2.packet[0] = padd(tRes.packet[4], acc2.packet[0]); - acc2.packet[1] = padd(tRes.packet[5], acc2.packet[1]); - acc2.packet[2] = padd(tRes.packet[6], acc2.packet[2]); - acc2.packet[3] = padd(tRes.packet[7], acc2.packet[3]); -} - -template -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) { - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX32_SECOND); + acc1.packet[0].v = vec_mergeh(taccReal.packet[0], 
taccImag.packet[0]); + if (N > 1) { + acc1.packet[1].v = vec_mergeh(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc1.packet[2].v = vec_mergeh(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc1.packet[3].v = vec_mergeh(taccReal.packet[3], taccImag.packet[3]); + } + + acc2.packet[0].v = vec_mergel(taccReal.packet[0], taccImag.packet[0]); + if (N > 1) { + acc2.packet[1].v = vec_mergel(taccReal.packet[1], taccImag.packet[1]); + } + if (N > 2) { + acc2.packet[2].v = vec_mergel(taccReal.packet[2], taccImag.packet[2]); + } + if (N > 3) { + acc2.packet[3].v = vec_mergel(taccReal.packet[3], taccImag.packet[3]); + } } -template -EIGEN_STRONG_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) +template +EIGEN_ALWAYS_INLINE void bcouple(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& tRes, PacketBlock& acc1, PacketBlock& acc2) { - bcouple_common(taccReal, taccImag, acc1, acc2); + bcouple_common(taccReal, taccImag, acc1, acc2); acc1.packet[0] = padd(tRes.packet[0], acc1.packet[0]); - - acc2.packet[0] = padd(tRes.packet[1], acc2.packet[0]); -} - -template<> -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_FIRST); - acc1.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[1].v = vec_perm(taccReal.packet[1], taccImag.packet[1], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[2].v = vec_perm(taccReal.packet[2], taccImag.packet[2], p16uc_SETCOMPLEX64_SECOND); - acc2.packet[3].v = vec_perm(taccReal.packet[3], taccImag.packet[3], p16uc_SETCOMPLEX64_SECOND); -} - -template<> -EIGEN_STRONG_INLINE void bcouple_common(PacketBlock& taccReal, PacketBlock& taccImag, PacketBlock& acc1, PacketBlock& acc2) -{ - acc1.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_FIRST); - - acc2.packet[0].v = vec_perm(taccReal.packet[0], taccImag.packet[0], p16uc_SETCOMPLEX64_SECOND); + if (N > 1) { + acc1.packet[1] = padd(tRes.packet[1], acc1.packet[1]); + } + if (N > 2) { + acc1.packet[2] = padd(tRes.packet[2], acc1.packet[2]); + } + if (N > 3) { + acc1.packet[3] = padd(tRes.packet[3], acc1.packet[3]); + } + + acc2.packet[0] = padd(tRes.packet[0+N], acc2.packet[0]); + if (N > 1) { + acc2.packet[1] = padd(tRes.packet[1+N], acc2.packet[1]); + } + if (N > 2) { + acc2.packet[2] = padd(tRes.packet[2+N], acc2.packet[2]); + } + if (N > 3) { + acc2.packet[3] = padd(tRes.packet[3+N], acc2.packet[3]); + } } // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. 
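// ---------------------------------------------------------------------------
// The rewrite above can drop the p16uc_SETCOMPLEX* permute masks because
// vec_mergeh/vec_mergel already interleave two result "planes" into
// (real, imag) pairs. A hedged standalone demo for Packet2d-sized lanes
// (POWER/VSX only; the values are illustrative):
#ifdef __VSX__
#include <altivec.h>
inline void merge_complex_demo()
{
  __vector double re = {1.0, 2.0};          // real parts of two results
  __vector double im = {10.0, 20.0};        // imaginary parts of the same two
  __vector double c0 = vec_mergeh(re, im);  // {1.0, 10.0}: first complex value
  __vector double c1 = vec_mergel(re, im);  // {2.0, 20.0}: second complex value
  (void)c0; (void)c1;
}
#endif
// ploadRhs itself is defined just below.
// ---------------------------------------------------------------------------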
template -EIGEN_STRONG_INLINE Packet ploadRhs(const Scalar* rhs) +EIGEN_ALWAYS_INLINE Packet ploadRhs(const Scalar* rhs) { - return *reinterpret_cast(const_cast(rhs)); + return ploadu(rhs); } } // end namespace internal diff --git a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h index 8edf79c4b7c3404aee6eacafa1bda23d55dcd962..7dda42339ee8e51137ac8e9d7301411666de9155 100644 --- a/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h +++ b/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h @@ -11,7 +11,11 @@ #ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H #define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H -#pragma GCC target("cpu=power10") +// If using dynamic dispatch, set the CPU target. +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC push_options +#pragma GCC target("cpu=power10,htm") +#endif #ifdef __has_builtin #if !__has_builtin(__builtin_vsx_assemble_pair) @@ -24,48 +28,48 @@ namespace Eigen { namespace internal { template -EIGEN_STRONG_INLINE void bsetzeroMMA(__vector_quad* acc) +EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); } template -EIGEN_STRONG_INLINE void storeAccumulator(Index i, Index j, const DataMapper& data, const Packet& alpha, __vector_quad* acc) +EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, __vector_quad* acc) { PacketBlock result; __builtin_mma_disassemble_acc(&result.packet, acc); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); - bscale(tRes, result, alpha); + bscale(tRes, result, alpha); - data.template storePacketBlock(i, j, tRes); + data.template storePacketBlock(i, 0, tRes); } -template -EIGEN_STRONG_INLINE void storeComplexAccumulator(Index i, Index j, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) +template +EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal, const Packet& alphaImag, __vector_quad* accReal, __vector_quad* accImag) { PacketBlock resultReal, resultImag; __builtin_mma_disassemble_acc(&resultReal.packet, accReal); __builtin_mma_disassemble_acc(&resultImag.packet, accImag); PacketBlock tRes; - bload(tRes, data, i, j); + bload(tRes, data, i, 0); PacketBlock taccReal, taccImag; bscalec(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag); PacketBlock acc1, acc2; - bcouple(taccReal, taccImag, tRes, acc1, acc2); + bcouple(taccReal, taccImag, tRes, acc1, acc2); - data.template storePacketBlock(i + N*accColsC, j, acc1); - data.template storePacketBlock(i + (N+1)*accColsC, j, acc2); + data.template storePacketBlock(i, 0, acc1); + data.template storePacketBlock(i + accColsC, 0, acc2); } // Defaults to float32, since Eigen still supports C++03 we can't use default template arguments template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) { if(NegativeAccumulate) { @@ -76,7 +80,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const L } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock& a, const Packet2d& b) { __vector_pair* a0 = (__vector_pair *)(&a.packet[0]); if(NegativeAccumulate) @@ -88,7 +92,7 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const PacketBlock 
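// ---------------------------------------------------------------------------
// A minimal illustration of the __vector_quad accumulator lifecycle that
// bsetzeroMMA/pgerMMA/storeAccumulator wrap (Power10 + MMA builds only).
// The builtins are real GCC/Clang MMA intrinsics; the wrapper name and
// output layout here are illustrative.
#if defined(__MMA__) && defined(__VSX__)
#include <altivec.h>
inline void mma_accumulator_demo(const float* a4, const float* b4, float out[4][4])
{
  __vector_quad acc;
  __builtin_mma_xxsetaccz(&acc);                       // acc = 0
  __vector float va = vec_xl(0, a4);                   // 4 lhs values
  __vector float vb = vec_xl(0, b4);                   // 4 rhs values
  __builtin_mma_xvf32gerpp(&acc,                       // rank-1 update of the
                           (__vector unsigned char)va, // 4x4 accumulator
                           (__vector unsigned char)vb);
  __vector float rows[4];
  __builtin_mma_disassemble_acc(rows, &acc);           // spill acc to 4 VSRs
  for (int i = 0; i < 4; ++i) vec_xst(rows[i], 0, out[i]);
}
#endif
// ---------------------------------------------------------------------------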
-EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) { if(NegativeAccumulate) { @@ -99,15 +103,13 @@ EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, con } template -EIGEN_STRONG_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet4f& b) +EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad*, const __vector_pair&, const Packet4f&) { - EIGEN_UNUSED_VARIABLE(acc); // Just for compilation - EIGEN_UNUSED_VARIABLE(a); - EIGEN_UNUSED_VARIABLE(b); + // Just for compilation } template -EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) +EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, const Packet& lhsVi, const RhsPacket& rhsV, const RhsPacket& rhsVi) { pgerMMA(accReal, rhsV, lhsV); if(LhsIsReal) { @@ -125,20 +127,20 @@ EIGEN_STRONG_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag // This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled. template -EIGEN_STRONG_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) { - rhsV = ploadRhs((const Scalar*)(rhs)); + rhsV = ploadRhs(rhs); } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA >(const double* rhs, PacketBlock& rhsV) { rhsV.packet[0] = ploadRhs((const double *)((Packet2d *)rhs )); rhsV.packet[1] = ploadRhs((const double *)(((Packet2d *)rhs) + 1)); } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) { #if EIGEN_COMP_LLVM __builtin_vsx_assemble_pair(&rhsV, @@ -150,11 +152,9 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const double* rhs, _ } template<> -EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) +EIGEN_ALWAYS_INLINE void ploadRhsMMA(const float*, __vector_pair&) { // Just for compilation - EIGEN_UNUSED_VARIABLE(rhs); - EIGEN_UNUSED_VARIABLE(rhsV); } // PEEL_MMA loop factor. 
@@ -188,12 +188,11 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) } #define MICRO_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ + type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7; \ MICRO_MMA_TYPE_PEEL(func,func2,type,0); MICRO_MMA_TYPE_PEEL(func,func2,type,1); \ MICRO_MMA_TYPE_PEEL(func,func2,type,2); MICRO_MMA_TYPE_PEEL(func,func2,type,3); \ MICRO_MMA_TYPE_PEEL(func,func2,type,4); MICRO_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_MMA_TYPE_PEEL(func,func2,type,8); MICRO_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_MMA_TYPE_PEEL(func,func2,type,6); MICRO_MMA_TYPE_PEEL(func,func2,type,7); #define MICRO_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0; \ @@ -226,7 +225,7 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) #define MICRO_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols + accCols*offsetA; \ + lhs_ptr##iter = lhs_base + ( (row/accCols) + iter )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr##iter); \ } @@ -242,26 +241,23 @@ EIGEN_STRONG_INLINE void ploadRhsMMA(const float* rhs, __vector_pair& rhsV) #define MICRO_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeAccumulator(row + iter*accCols, col, res, pAlpha, &accZero##iter); \ + storeAccumulator(row + iter*accCols, res, pAlpha, &accZero##iter); \ } #define MICRO_MMA_STORE MICRO_MMA_UNROLL(MICRO_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index& row, - Index col, const Packet& pAlpha) { -asm("#gemm_MMA begin"); const Scalar* rhs_ptr = rhs_base; - const Scalar* lhs_ptr0, * lhs_ptr1, * lhs_ptr2, * lhs_ptr3, * lhs_ptr4, * lhs_ptr5, * lhs_ptr6, * lhs_ptr7; + const Scalar* lhs_ptr0 = NULL, * lhs_ptr1 = NULL, * lhs_ptr2 = NULL, * lhs_ptr3 = NULL, * lhs_ptr4 = NULL, * lhs_ptr5 = NULL, * lhs_ptr6 = NULL, * lhs_ptr7 = NULL; __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7; MICRO_MMA_SRC_PTR @@ -281,97 +277,100 @@ asm("#gemm_MMA begin"); MICRO_MMA_STORE row += unroll_factor*accCols; -asm("#gemm_MMA end"); } -template -void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +template +EIGEN_ALWAYS_INLINE void gemmMMA_cols( + const DataMapper& res, + const Scalar* blockA, + const Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlpha, + const Packet& pMask) { - const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; + const DataMapper res3 = res.getSubMapper(0, col); - if( strideA == -1 ) strideA = depth; - if( strideB == -1 ) strideB = depth; - - const Packet pAlpha = pset1(alpha); - const Packet pMask = bmask((const int)(remaining_rows)); - - Index col = 0; - for(; col + accRows <= cols; col += accRows) - { - const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; + const Scalar* rhs_base = blockB + col*strideB + accRows*offsetB; + 
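// ---------------------------------------------------------------------------
// Why "+ accCols*offsetA" disappears from MICRO_MMA_SRC_PTR_ONE below: the
// offset is now folded into lhs_base once per column block, and
//   blockA + ((row/accCols) + iter)*strideA*accCols + accCols*offsetA
//     == (blockA + accCols*offsetA) + ((row/accCols) + iter)*strideA*accCols
//     ==  lhs_base                  + ((row/accCols) + iter)*strideA*accCols,
// so every unroll slot computes the same address with one fewer term.
// ---------------------------------------------------------------------------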
const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; - Index row = 0; #define MAX_MMA_UNROLL 7 - while(row + MAX_MMA_UNROLL*accCols <= rows) { - gemm_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - } - switch( (rows-row)/accCols ) { + while(row + MAX_MMA_UNROLL*accCols <= rows) { + gemm_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + } + switch( (rows-row)/accCols ) { #if MAX_MMA_UNROLL > 7 - case 7: - gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 7: + gemm_unrolled_MMA_iteration<7, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 6 - case 6: - gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 6: + gemm_unrolled_MMA_iteration<6, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 5 - case 5: - gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 5: + gemm_unrolled_MMA_iteration<5, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 4 - case 4: - gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 4: + gemm_unrolled_MMA_iteration<4, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 3 - case 3: - gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 3: + gemm_unrolled_MMA_iteration<3, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 2 - case 2: - gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 2: + gemm_unrolled_MMA_iteration<2, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif #if MAX_MMA_UNROLL > 1 - case 1: - gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, pAlpha); - break; + case 1: + gemm_unrolled_MMA_iteration<1, Scalar, Packet, RhsPacket, DataMapper, Index, accRows, accCols>(res3, lhs_base, rhs_base, depth, strideA, row, pAlpha); + break; #endif - default: - break; - } + default: + break; + } #undef MAX_MMA_UNROLL - if(remaining_rows > 0) - { - gemm_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); - } - } + if(remaining_rows > 0) + { + gemm_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, row, col, rows, cols, remaining_rows, pAlpha, pMask); + } 
+} - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; +template +void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) +{ + const Index remaining_rows = rows % accCols; - for(; col < cols; col++) - { - Index row = 0; + if( strideA == -1 ) strideA = depth; + if( strideB == -1 ) strideB = depth; - gemm_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, rows, col, remaining_cols, pAlpha); + const Packet pAlpha = pset1(alpha); + const Packet pMask = bmask((const int)(remaining_rows)); - if (remaining_rows > 0) - { - gemm_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, row, col, remaining_rows, remaining_cols, pAlpha); - } - rhs_base++; - } + Index col = 0; + for(; col + accRows <= cols; col += accRows) + { + gemmMMA_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } + + gemm_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlpha, pMask); } #define accColsC (accCols / 2) @@ -379,21 +378,20 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define advanceCols ((RhsIsReal) ? 1 : 2) // PEEL_COMPLEX_MMA loop factor. -#define PEEL_COMPLEX_MMA 7 +#define PEEL_COMPLEX_MMA 3 #define MICRO_COMPLEX_MMA_UNROLL(func) \ - func(0) func(1) func(2) func(3) func(4) + func(0) func(1) func(2) func(3) #define MICRO_COMPLEX_MMA_LOAD_ONE(iter) \ if (unroll_factor > iter) { \ lhsV##iter = ploadLhs(lhs_ptr_real##iter); \ - lhs_ptr_real##iter += accCols; \ if(!LhsIsReal) { \ - lhsVi##iter = ploadLhs(lhs_ptr_imag##iter); \ - lhs_ptr_imag##iter += accCols; \ + lhsVi##iter = ploadLhs(lhs_ptr_real##iter + imag_delta); \ } else { \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ } \ + lhs_ptr_real##iter += accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhsV##iter); \ EIGEN_UNUSED_VARIABLE(lhsVi##iter); \ @@ -406,8 +404,8 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_TYPE_PEEL(func, func2, type, peel) \ if (PEEL_COMPLEX_MMA > peel) { \ - Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4; \ - Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3, lhsVi4; \ + Packet lhsV0, lhsV1, lhsV2, lhsV3; \ + Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3; \ ploadRhsMMA(rhs_ptr_real + (accRows * peel), rhsV##peel); \ if(!RhsIsReal) { \ ploadRhsMMA(rhs_ptr_imag + (accRows * peel), rhsVi##peel); \ @@ -415,20 +413,17 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } \ MICRO_COMPLEX_MMA_UNROLL(func2); \ - func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) func(4,type,peel) \ + func(0,type,peel) func(1,type,peel) func(2,type,peel) func(3,type,peel) \ } else { \ EIGEN_UNUSED_VARIABLE(rhsV##peel); \ EIGEN_UNUSED_VARIABLE(rhsVi##peel); \ } #define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(func, func2, type) \ - type rhsV0, rhsV1, rhsV2, rhsV3, rhsV4, rhsV5, rhsV6, rhsV7, rhsV8, rhsV9; \ - type rhsVi0, rhsVi1, rhsVi2, rhsVi3, rhsVi4, rhsVi5, rhsVi6, rhsVi7, rhsVi8, rhsVi9; \ + type rhsV0, rhsV1, rhsV2, rhsV3; \ + type rhsVi0, rhsVi1, rhsVi2, rhsVi3; \ MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,0); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,1); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); 
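// ---------------------------------------------------------------------------
// MICRO_COMPLEX_MMA_LOAD_ONE below relies on the packed complex-LHS layout
// (assumed from the packing code, which is outside this hunk): each
// accCols-wide panel stores a plane of real parts and, accCols*strideA
// elements later, the matching plane of imaginary parts. One pointer then
// addresses both planes,
//   imag_delta = accCols * strideA;
//   real part of element k -> lhs_ptr_real[k]
//   imag part of element k -> lhs_ptr_real[k + imag_delta]
// which is why the separate lhs_ptr_imag pointers and their bookkeeping can
// be deleted throughout this patch.
// ---------------------------------------------------------------------------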
MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,4); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,5); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,6); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,7); \ - MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,8); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,9); + MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,2); MICRO_COMPLEX_MMA_TYPE_PEEL(func,func2,type,3); #define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(func, func2, type) \ type rhsV0, rhsVi0; \ @@ -465,15 +460,9 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_SRC_PTR_ONE(iter) \ if (unroll_factor > iter) { \ - lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols + accCols*offsetA; \ - if(!LhsIsReal) { \ - lhs_ptr_imag##iter = lhs_ptr_real##iter + accCols*strideA; \ - } else { \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ - } \ + lhs_ptr_real##iter = lhs_base + ( ((advanceRows*row)/accCols) + iter*advanceRows )*strideA*accCols; \ } else { \ EIGEN_UNUSED_VARIABLE(lhs_ptr_real##iter); \ - EIGEN_UNUSED_VARIABLE(lhs_ptr_imag##iter); \ } #define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_SRC_PTR_ONE) @@ -481,46 +470,40 @@ void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, #define MICRO_COMPLEX_MMA_PREFETCH_ONE(iter) \ if (unroll_factor > iter) { \ EIGEN_POWER_PREFETCH(lhs_ptr_real##iter); \ - if(!LhsIsReal) { \ - EIGEN_POWER_PREFETCH(lhs_ptr_imag##iter); \ - } \ } #define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_PREFETCH_ONE) #define MICRO_COMPLEX_MMA_STORE_ONE(iter) \ if (unroll_factor > iter) { \ - storeComplexAccumulator(row + iter*accCols, col, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ + storeComplexAccumulator(row + iter*accCols, res, pAlphaReal, pAlphaImag, &accReal##iter, &accImag##iter); \ } #define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE) template -EIGEN_STRONG_INLINE void gemm_complex_unrolled_MMA_iteration( +EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration( const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base, Index depth, Index strideA, - Index offsetA, Index strideB, Index& row, - Index col, const Packet& pAlphaReal, const Packet& pAlphaImag) { -asm("#gemm_complex_MMA begin"); const Scalar* rhs_ptr_real = rhs_base; - const Scalar* rhs_ptr_imag; + const Scalar* rhs_ptr_imag = NULL; + const Index imag_delta = accCols*strideA; if(!RhsIsReal) { rhs_ptr_imag = rhs_base + accRows*strideB; } else { EIGEN_UNUSED_VARIABLE(rhs_ptr_imag); } - const Scalar* lhs_ptr_real0, * lhs_ptr_imag0, * lhs_ptr_real1, * lhs_ptr_imag1; - const Scalar* lhs_ptr_real2, * lhs_ptr_imag2, * lhs_ptr_real3, * lhs_ptr_imag3; - const Scalar* lhs_ptr_real4, * lhs_ptr_imag4; - __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3, accReal4, accImag4; + const Scalar* lhs_ptr_real0 = NULL, * lhs_ptr_real1 = NULL; + const Scalar* lhs_ptr_real2 = NULL, * lhs_ptr_real3 = NULL; + __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3; MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_DST_PTR @@ -542,14 +525,72 @@ asm("#gemm_complex_MMA begin"); MICRO_COMPLEX_MMA_STORE row += unroll_factor*accCols; -asm("#gemm_complex_MMA end"); +} + +template +EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols( + const DataMapper& res, + const Scalar* blockA, + const 
Scalar* blockB, + Index depth, + Index strideA, + Index offsetA, + Index strideB, + Index offsetB, + Index col, + Index rows, + Index cols, + Index remaining_rows, + const Packet& pAlphaReal, + const Packet& pAlphaImag, + const Packet& pMask) +{ + const DataMapper res3 = res.getSubMapper(0, col); + + const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; + const Scalar* lhs_base = blockA + accCols*offsetA; + Index row = 0; + +#define MAX_COMPLEX_MMA_UNROLL 4 + while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { + gemm_complex_unrolled_MMA_iteration(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + } + switch( (rows-row)/accCols ) { +#if MAX_COMPLEX_MMA_UNROLL > 4 + case 4: + gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 3 + case 3: + gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 2 + case 2: + gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif +#if MAX_COMPLEX_MMA_UNROLL > 1 + case 1: + gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res3, lhs_base, rhs_base, depth, strideA, strideB, row, pAlphaReal, pAlphaImag); + break; +#endif + default: + break; + } +#undef MAX_COMPLEX_MMA_UNROLL + + if(remaining_rows > 0) + { + gemm_complex_extra_row(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); + } } template void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) { const Index remaining_rows = rows % accCols; - const Index remaining_cols = cols % accRows; if( strideA == -1 ) strideA = depth; if( strideB == -1 ) strideB = depth; @@ -564,74 +605,23 @@ void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsS Index col = 0; for(; col + accRows <= cols; col += accRows) { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + accRows*offsetB; - const Scalar* lhs_base = blockA; - Index row = 0; - -#define MAX_COMPLEX_MMA_UNROLL 4 - while(row + MAX_COMPLEX_MMA_UNROLL*accCols <= rows) { - gemm_complex_unrolled_MMA_iteration(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - } - switch( (rows-row)/accCols ) { -#if MAX_COMPLEX_MMA_UNROLL > 4 - case 4: - gemm_complex_unrolled_MMA_iteration<4, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 3 - case 3: - gemm_complex_unrolled_MMA_iteration<3, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, 
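// ---------------------------------------------------------------------------
// gemmMMA_complex_cols above hoists the column offset into a submapper once
// per column block (res3 = res.getSubMapper(0, col)), so the store helpers
// can drop their column parameter and always index column 0. Schematically,
// assuming the usual sub-mapper semantics: res.getSubMapper(0, col) at
// (i, 0) addresses the same element as res at (i, col), so
// storeComplexAccumulator(row, res3, ...) writes exactly where the old
// storeComplexAccumulator(row, col, res, ...) did, with one fewer index to
// thread through every macro.
// ---------------------------------------------------------------------------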
accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 2 - case 2: - gemm_complex_unrolled_MMA_iteration<2, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif -#if MAX_COMPLEX_MMA_UNROLL > 1 - case 1: - gemm_complex_unrolled_MMA_iteration<1, Scalar, Packet, Packetc, RhsPacket, DataMapper, Index, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, pAlphaReal, pAlphaImag); - break; -#endif - default: - break; - } -#undef MAX_COMPLEX_MMA_UNROLL - - if(remaining_rows > 0) - { - gemm_complex_extra_row(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); - } + gemmMMA_complex_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } - if(remaining_cols > 0) - { - const Scalar* rhs_base = blockB + advanceCols*col*strideB + remaining_cols*offsetB; - const Scalar* lhs_base = blockA; - - for(; col < cols; col++) - { - Index row = 0; - - gemm_complex_unrolled_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, col, remaining_cols, pAlphaReal, pAlphaImag); - - if (remaining_rows > 0) - { - gemm_complex_extra_col(res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, col, remaining_rows, remaining_cols, pAlphaReal, pAlphaImag); - } - rhs_base++; - } - } + gemm_complex_extra_cols(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols, remaining_rows, pAlphaReal, pAlphaImag, pMask); } #undef accColsC #undef advanceRows #undef advanceCols -#pragma GCC reset_options } // end namespace internal } // end namespace Eigen +#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) +#pragma GCC pop_options +#endif + #endif // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H diff --git a/Eigen/src/Core/arch/AltiVec/PacketMath.h b/Eigen/src/Core/arch/AltiVec/PacketMath.h index 7c70c07b1c86a269a1cacfa70b2fad8b2fb6fd4d..528f995d3535b0216adae4df5de0a3cbfa591517 100755 --- a/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -437,7 +433,7 @@ EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet)* from) EIGEN_UNUSED_VARIABLE(from); EIGEN_DEBUG_ALIGNED_LOAD #ifdef __VSX__ - return vec_xl(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #else return vec_ld(0, from); #endif @@ -790,8 +786,22 @@ template<> EIGEN_STRONG_INLINE Packet8us psub (const Packet8us& a, template<> EIGEN_STRONG_INLINE Packet16c psub (const Packet16c& a, const Packet16c& b) { return a - b; } template<> EIGEN_STRONG_INLINE Packet16uc psub(const Packet16uc& a, const Packet16uc& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } -template<> EIGEN_STRONG_INLINE 
Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p4f_MZERO); +#endif +} +template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return p4i_ZERO - a; +#endif +} template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } @@ -869,19 +879,31 @@ template<> EIGEN_STRONG_INLINE Packet16c pmax(const Packet16c& a, con template<> EIGEN_STRONG_INLINE Packet16uc pmax(const Packet16uc& a, const Packet16uc& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmple(a,b)); } +// Works around a bug in vec_cmplt on older compiler versions +#if defined(__POWER8_VECTOR__) || EIGEN_COMP_LLVM template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmplt(a,b)); } +#endif template<> EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } - -template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } -template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } - template<> EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) { Packet4f c = reinterpret_cast(vec_cmpge(a,b)); return vec_nor(c,c); } + +template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmplt(a,b)); } template<> EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) { return reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) { return 
reinterpret_cast(vec_cmpeq(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmple(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmplt(a,b)); } +template<> EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) { return reinterpret_cast(vec_cmpeq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pand(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } template<> EIGEN_STRONG_INLINE Packet4i pand(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } @@ -906,8 +928,8 @@ template<> EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, con return pxor(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } -template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } +template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, const Packet4f& b) { return vec_andc(a, b); } +template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vec_andc(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) { return vec_sel(b, a, reinterpret_cast(mask)); @@ -956,7 +978,7 @@ template EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPAC return (Packet) vec_perm(MSQ, LSQ, mask); // align the data #else EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from)); #endif } @@ -1264,15 +1286,15 @@ EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f){ Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_max_exp); Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast(p4i_ZERO)); - Packet4bi is_mant_not_zero = vec_cmpne(mantissa, reinterpret_cast(p4i_ZERO)); - Packet4ui nan_selector = pand( + Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast(p4i_ZERO)); + Packet4ui nan_selector = pandnot( reinterpret_cast(is_max_exp), - reinterpret_cast(is_mant_not_zero) + reinterpret_cast(is_mant_zero) ); - Packet4ui subnormal_selector = pand( + Packet4ui subnormal_selector = pandnot( reinterpret_cast(is_zero_exp), - reinterpret_cast(is_mant_not_zero) + reinterpret_cast(is_mant_zero) ); const _EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000); @@ -1336,16 +1358,6 @@ template<> EIGEN_STRONG_INLINE Packet8bf psub(const Packet8bf& a, con BF16_TO_F32_BINARY_OP_WRAPPER(psub, a, b); } -template<> EIGEN_STRONG_INLINE Packet8bf psqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(vec_sqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf prsqrt (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt, a); -} -template<> EIGEN_STRONG_INLINE Packet8bf pexp (const Packet8bf& a){ - BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a); -} - template<> EIGEN_STRONG_INLINE Packet4f pldexp(const Packet4f& a, const Packet4f& exponent) { return pldexp_generic(a,exponent); } @@ -1411,6 +1423,9 @@ template<> EIGEN_STRONG_INLINE Packet8bf pmax(const Packet8bf& a, con template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt, a, b); } +template<> EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) { + BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan, a, b); +} template<> EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, 
const Packet8bf& b) { BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le, a, b); } @@ -2260,7 +2275,8 @@ static Packet2ul p2ul_SIGN = { 0x8000000000000000ull, 0x8000000000000000ull }; static Packet2ul p2ul_PREV0DOT5 = { 0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull }; static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast(p4f_ZERO); -static Packet2d p2d_MZERO = { -0.0, -0.0 }; +static Packet2d p2d_MZERO = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #ifdef _BIG_ENDIAN static Packet2d p2d_COUNTDOWN = reinterpret_cast(vec_sld(reinterpret_cast(p2d_ZERO), reinterpret_cast(p2d_ONE), 8)); @@ -2295,7 +2311,11 @@ template<> struct packet_traits : default_packet_traits HasLog = 0, HasExp = 1, HasSqrt = 1, +#if !EIGEN_COMP_CLANG HasRsqrt = 1, +#else + HasRsqrt = 0, +#endif HasRound = 1, HasFloor = 1, HasCeil = 1, @@ -2384,7 +2404,14 @@ template<> EIGEN_STRONG_INLINE Packet2d padd(const Packet2d& a, const template<> EIGEN_STRONG_INLINE Packet2d psub(const Packet2d& a, const Packet2d& b) { return a - b; } -template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } +template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) +{ +#ifdef __POWER8_VECTOR__ + return vec_neg(a); +#else + return vec_xor(a, p2d_MZERO); +#endif +} template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } @@ -2453,7 +2480,7 @@ template<> EIGEN_STRONG_INLINE Packet2d print(const Packet2d& a) template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD - return vec_xl(0, from); + return vec_xl(0, const_cast(from)); } template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index b1618e56784bd3586e2bb4475a2adc09cc0f8687..45f6ddb949fd224b26a2c1ce514af590e8dffb38 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -11,13 +11,24 @@ #ifndef EIGEN_COMPLEX_CUDA_H #define EIGEN_COMPLEX_CUDA_H -// clang-format off // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, GCC and older versions of clang do // not treat them as device functions and thus Eigen functors making use of // these operators fail to compile. Here, we manually specialize these // operators and functors for complex types when building for CUDA to enable // their use on-device. +// +// NOTES: +// - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device, +// since they are already specialized in the standard. Using them will result +// in silent kernel failures. +// - Compiling with MSVC and using +=,-=,*=,/=(std::complex<T>) will lead +// to duplicate definition errors, since these are already specialized in +// Visual Studio's <complex> header (contrary to the standard). This is +// preferable to removing such definitions, which will lead to silent kernel +// failures. +// - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior +// to the first inclusion of <complex>. 
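As a rough illustration of the note above (a sketch, not part of the patch itself; the header generates its real definitions for float and double through macros), a manually specialized device-side operator takes roughly this shape:

EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> operator*(
    const std::complex<float>& a, const std::complex<float>& b) {
  // Written out by hand so nvcc/older clang treat it as a device function;
  // the std:: overload is not constexpr and stays host-only under those compilers.
  return std::complex<float>(a.real() * b.real() - a.imag() * b.imag(),
                             a.real() * b.imag() + a.imag() * b.real());
}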
#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) @@ -67,27 +78,26 @@ std::complex complex_divide_fast(const std::complex& a, const std::complex const T a_imag = numext::imag(a); const T b_real = numext::real(b); const T b_imag = numext::imag(b); - const T norm = T(1) / (b_real * b_real + b_imag * b_imag); - return std::complex((a_real * b_real + a_imag * b_imag) * norm, - (a_imag * b_real - a_real * b_imag) * norm); + const T norm = (b_real * b_real + b_imag * b_imag); + return std::complex((a_real * b_real + a_imag * b_imag) / norm, + (a_imag * b_real - a_real * b_imag) / norm); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex complex_divide_stable(const std::complex& a, const std::complex& b) { + const T a_real = numext::real(a); + const T a_imag = numext::imag(a); const T b_real = numext::real(b); const T b_imag = numext::imag(b); - // Guard against over/under-flow. - const T scale = T(1) / (numext::abs(b_real) + numext::abs(b_imag)); - const T a_real_scaled = numext::real(a) * scale; - const T a_imag_scaled = numext::imag(a) * scale; - const T b_real_scaled = b_real * scale; - const T b_imag_scaled = b_imag * scale; - - const T b_norm2_scaled = b_real_scaled * b_real_scaled + b_imag_scaled * b_imag_scaled; - return std::complex( - (a_real_scaled * b_real_scaled + a_imag_scaled * b_imag_scaled) / b_norm2_scaled, - (a_imag_scaled * b_real_scaled - a_real_scaled * b_imag_scaled) / b_norm2_scaled); + // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf), + // guards against over/under-flow. + const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real); + const T rscale = scale_imag ? T(1) : b_real / b_imag; + const T iscale = scale_imag ? b_imag / b_real : T(1); + const T denominator = b_real * rscale + b_imag * iscale; + return std::complex((a_real * rscale + a_imag * iscale) / denominator, + (a_imag * rscale - a_real * iscale) / denominator); } template diff --git a/Eigen/src/Core/arch/Default/BFloat16.h b/Eigen/src/Core/arch/Default/BFloat16.h index aac60f15cc2801c1aacdc50053e38fdfc433decd..f21d1a0a32ccea5f965d9a2a1f8e42449399d661 100644 --- a/Eigen/src/Core/arch/Default/BFloat16.h +++ b/Eigen/src/Core/arch/Default/BFloat16.h @@ -250,17 +250,8 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const if (Eigen::numext::isnan EIGEN_NOT_A_MACRO(v)) { output.value = std::signbit(v) ? 0xFFC0: 0x7FC0; return output; - } else if (std::fabs(v) < std::numeric_limits::min EIGEN_NOT_A_MACRO()) { - // Flush denormal to +/- 0. - output.value = std::signbit(v) ? 0x8000 : 0; - return output; } - const uint16_t* p = reinterpret_cast(&v); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - output.value = p[0]; -#else - output.value = p[1]; -#endif + output.value = static_cast(numext::bit_cast(v) >> 16); return output; } @@ -288,9 +279,6 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne::min EIGEN_NOT_A_MACRO()) { - // Flush denormal to +/- 0.0 - output.value = std::signbit(ff) ? 0x8000 : 0; } else { // Fast rounding algorithm that rounds a half value to nearest even. This // reduces expected error when we convert a large number of floats. 
Here @@ -469,14 +457,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(&result); -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ - q[0] = h.value; -#else - q[1] = h.value; -#endif - return result; + return numext::bit_cast(static_cast(h.value) << 16); } // --- standard functions --- diff --git a/Eigen/src/Core/arch/Default/ConjHelper.h b/Eigen/src/Core/arch/Default/ConjHelper.h index 4cfe34e05268c459645c00449b0a92b23f1d0c2b..53830b5a274c72546315eaea5d1fb4fb737e3a8b 100644 --- a/Eigen/src/Core/arch/Default/ConjHelper.h +++ b/Eigen/src/Core/arch/Default/ConjHelper.h @@ -11,19 +11,107 @@ #ifndef EIGEN_ARCH_CONJ_HELPER_H #define EIGEN_ARCH_CONJ_HELPER_H -#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ - }; \ - \ - template<> struct conj_helper { \ - EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ - { return padd(c, pmul(x,y)); } \ - EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ - { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, \ + const PACKET_CPLX& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, \ + const PACKET_CPLX& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); \ + } \ + }; \ + \ + template <> \ + struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, \ + const PACKET_REAL& y, \ + const PACKET_CPLX& c) const { \ + return padd(c, this->pmul(x, y)); \ + } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, \ + const PACKET_REAL& y) const { \ + return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); \ + } \ }; -#endif // EIGEN_ARCH_CONJ_HELPER_H +namespace Eigen { +namespace internal { + +template struct conj_if; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return numext::conj(x); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const { return internal::pconj(x); } +}; + +template<> struct conj_if { + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const { return x; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const { return x; } +}; + +// Generic Implementation, assume scalars since the packet-version is +// specialized below. 
+template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsType& x, const RhsType& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsType& x, const RhsType& y) const + { return conj_if()(x) * conj_if()(y); } +}; + +template +struct conj_helper { + typedef typename ScalarBinaryOpTraits::ReturnType ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmadd(const LhsScalar& x, const RhsScalar& y, const ResultType& c) const + { return this->pmul(x, y) + c; } + + // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType + pmul(const LhsScalar& x, const RhsScalar& y) const + { return numext::conj(x * y); } +}; + +// Implementation with equal types, using packet operations. +template +struct conj_helper +{ + typedef Packet ResultType; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(conj_if().pconj(x), conj_if().pconj(y), c); } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return Eigen::internal::pmul(conj_if().pconj(x), conj_if().pconj(y)); } +}; + +template +struct conj_helper +{ + typedef Packet ResultType; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const + { return Eigen::internal::pmadd(pconj(x), pconj(y), c); } + // We save a conjugation by using the identity conj(a)*conj(b) = conj(a*b). + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const + { return pconj(Eigen::internal::pmul(x, y)); } +}; + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index 87e8c27033b27466ba36a312c684ed61ef06de9d..95fb686a1cae36fef9277594042f7139ec0b5777 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -19,12 +19,6 @@ namespace Eigen { namespace internal { -template EIGEN_DEVICE_FUNC inline Packet -pset(const typename unpacket_traits::type (&a)[N] /* a */) { - EIGEN_STATIC_ASSERT(unpacket_traits::size == N, THE_ARRAY_SIZE_SHOULD_EQUAL_WITH_PACKET_SIZE); - return pload(a); -} - // Creates a Scalar integer type with same bit-width. template struct make_integer; template<> struct make_integer { typedef numext::int32_t type; }; @@ -808,9 +802,8 @@ Packet psqrt_complex(const Packet& a) { // l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)), // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1. - Packet a_flip = pcplxflip(a); RealPacket a_abs = pabs(a.v); // [|x0|, |y0|, |x1|, |y1|] - RealPacket a_abs_flip = pabs(a_flip.v); // [|y0|, |x0|, |y1|, |x1|] + RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v; // [|y0|, |x0|, |y1|, |x1|] RealPacket a_max = pmax(a_abs, a_abs_flip); RealPacket a_min = pmin(a_abs, a_abs_flip); RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min)); @@ -839,7 +832,8 @@ Packet psqrt_complex(const Packet& a) { // Step 4.
Compute solution for inputs with negative real part: // [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1] - const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), RealScalar(-0.0))).v; + const RealScalar neg_zero = RealScalar(numext::bit_cast(0x80000000u)); + const RealPacket cst_imag_sign_mask = pset1(Scalar(RealScalar(0.0), neg_zero)).v; RealPacket imag_signs = pand(a.v, cst_imag_sign_mask); Packet negative_real_result; // Notice that rho is positive, so taking it's absolute value is a noop. @@ -1449,39 +1443,40 @@ EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) { } // Generic implementation of pow(x,y). -template -EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS -EIGEN_UNUSED -Packet generic_pow(const Packet& x, const Packet& y) { +template +EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_pow(const Packet& x, const Packet& y) { typedef typename unpacket_traits::type Scalar; const Packet cst_pos_inf = pset1(NumTraits::infinity()); + const Packet cst_neg_inf = pset1(-NumTraits::infinity()); const Packet cst_zero = pset1(Scalar(0)); const Packet cst_one = pset1(Scalar(1)); const Packet cst_nan = pset1(NumTraits::quiet_NaN()); const Packet abs_x = pabs(x); // Predicates for sign and magnitude of x. - const Packet x_is_zero = pcmp_eq(x, cst_zero); - const Packet x_is_neg = pcmp_lt(x, cst_zero); + const Packet abs_x_is_zero = pcmp_eq(abs_x, cst_zero); + const Packet x_has_signbit = pcmp_eq(por(pand(x, cst_neg_inf), cst_pos_inf), cst_neg_inf); + const Packet x_is_neg = pandnot(x_has_signbit, abs_x_is_zero); + const Packet x_is_neg_zero = pand(x_has_signbit, abs_x_is_zero); const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf); - const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); + const Packet abs_x_is_one = pcmp_eq(abs_x, cst_one); const Packet abs_x_is_gt_one = pcmp_lt(cst_one, abs_x); const Packet abs_x_is_lt_one = pcmp_lt(abs_x, cst_one); - const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); - const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); + const Packet x_is_one = pandnot(abs_x_is_one, x_is_neg); + const Packet x_is_neg_one = pand(abs_x_is_one, x_is_neg); const Packet x_is_nan = pandnot(ptrue(x), pcmp_eq(x, x)); // Predicates for sign and magnitude of y. + const Packet abs_y = pabs(y); const Packet y_is_one = pcmp_eq(y, cst_one); - const Packet y_is_zero = pcmp_eq(y, cst_zero); + const Packet abs_y_is_zero = pcmp_eq(abs_y, cst_zero); const Packet y_is_neg = pcmp_lt(y, cst_zero); - const Packet y_is_pos = pandnot(ptrue(y), por(y_is_zero, y_is_neg)); + const Packet y_is_pos = pandnot(ptrue(y), por(abs_y_is_zero, y_is_neg)); const Packet y_is_nan = pandnot(ptrue(y), pcmp_eq(y, y)); - const Packet abs_y_is_inf = pcmp_eq(pabs(y), cst_pos_inf); + const Packet abs_y_is_inf = pcmp_eq(abs_y, cst_pos_inf); EIGEN_CONSTEXPR Scalar huge_exponent = - (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / - NumTraits::epsilon(); + (NumTraits::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits::epsilon(); const Packet abs_y_is_huge = pcmp_le(pset1(huge_exponent), pabs(y)); // Predicates for whether y is integer and/or even. 
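For reference, the integer/evenness predicates that generic_pow builds below from pround and pcmp_eq are the packet form of the following scalar tests (an illustrative sketch, not part of the patch):

#include <cmath>
// y is an integer exactly when rounding it changes nothing;
// an integer y is even exactly when y/2 is itself an integer.
inline bool scalar_y_is_int(double y)  { return std::round(y) == y; }
inline bool scalar_y_is_even(double y) { return std::round(y / 2) == y / 2; }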
@@ -1490,39 +1485,31 @@ Packet generic_pow(const Packet& x, const Packet& y) { const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2); // Predicates encoding special cases for the value of pow(x,y) - const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), - y_is_int), - abs_y_is_inf); - const Packet pow_is_one = por(por(x_is_one, y_is_zero), - pand(x_is_neg_one, - por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet invalid_negative_x = pandnot(pandnot(pandnot(x_is_neg, abs_x_is_inf), y_is_int), abs_y_is_inf); const Packet pow_is_nan = por(invalid_negative_x, por(x_is_nan, y_is_nan)); - const Packet pow_is_zero = por(por(por(pand(x_is_zero, y_is_pos), - pand(abs_x_is_inf, y_is_neg)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_pos)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_neg)); - const Packet pow_is_inf = por(por(por(pand(x_is_zero, y_is_neg), - pand(abs_x_is_inf, y_is_pos)), - pand(pand(abs_x_is_lt_one, abs_y_is_huge), - y_is_neg)), - pand(pand(abs_x_is_gt_one, abs_y_is_huge), - y_is_pos)); + const Packet pow_is_one = + por(por(x_is_one, abs_y_is_zero), pand(x_is_neg_one, por(abs_y_is_inf, pandnot(y_is_even, invalid_negative_x)))); + const Packet pow_is_zero = por(por(por(pand(abs_x_is_zero, y_is_pos), pand(abs_x_is_inf, y_is_neg)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_pos)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_neg)); + const Packet pow_is_inf = por(por(por(pand(abs_x_is_zero, y_is_neg), pand(abs_x_is_inf, y_is_pos)), + pand(pand(abs_x_is_lt_one, abs_y_is_huge), y_is_neg)), + pand(pand(abs_x_is_gt_one, abs_y_is_huge), y_is_pos)); + const Packet inf_val = + pselect(pandnot(pand(por(pand(abs_x_is_inf, x_is_neg), pand(x_is_neg_zero, y_is_neg)), y_is_int), y_is_even), + cst_neg_inf, cst_pos_inf); // General computation of pow(x,y) for positive x or negative x and integer y. const Packet negate_pow_abs = pandnot(x_is_neg, y_is_even); const Packet pow_abs = generic_pow_impl(abs_x, y); - return pselect(y_is_one, x, - pselect(pow_is_one, cst_one, - pselect(pow_is_nan, cst_nan, - pselect(pow_is_inf, cst_pos_inf, - pselect(pow_is_zero, cst_zero, - pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); + return pselect( + y_is_one, x, + pselect(pow_is_one, cst_one, + pselect(pow_is_nan, cst_nan, + pselect(pow_is_inf, inf_val, + pselect(pow_is_zero, cst_zero, pselect(negate_pow_abs, pnegate(pow_abs), pow_abs)))))); } - - /* polevl (modified for Eigen) * * Evaluate polynomial diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h index 637e5f4afcc0fb8ebbe7158a0bb1e3daa2d2b1ee..177a04e93e646cf735e1816857ea16de1cd8a4da 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h @@ -17,10 +17,6 @@ namespace internal { // implemented in GenericPacketMathFunctions.h // This is needed to workaround a circular dependency. 
-/** \internal \returns a packet with constant coefficients \a a, e.g.: (a[N-1],...,a[0]) */ -template EIGEN_DEVICE_FUNC inline Packet -pset(const typename unpacket_traits::type (&a)[N] /* a */); - /*************************************************************************** * Some generic implementations to be used by implementors ***************************************************************************/ diff --git a/Eigen/src/Core/arch/Default/Half.h b/Eigen/src/Core/arch/Default/Half.h index 9f8e8cc1e7a614cf86b4d4e7065df97fb06404c3..6e2b31f7625407974230797e712c73aa0c4b0ae9 100644 --- a/Eigen/src/Core/arch/Default/Half.h +++ b/Eigen/src/Core/arch/Default/Half.h @@ -36,8 +36,6 @@ #ifndef EIGEN_HALF_H #define EIGEN_HALF_H -#include - #if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) // When compiling with GPU support, the "__half_raw" base class as well as // some other routines are defined in the GPU compiler header files @@ -334,7 +332,7 @@ EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { } #endif -#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) +#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); } @@ -534,7 +532,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { #elif defined(EIGEN_HAS_FP16_C) __half_raw h; - h.x = _cvtss_sh(ff, 0); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + h.x =_mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0); + #else + h.x = _cvtss_sh(ff, 0); + #endif return h; #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) @@ -595,7 +598,12 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); + #if EIGEN_COMP_MSVC + // MSVC does not have scalar instructions. + return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x))); + #else + return _cvtsh_ss(h.x); + #endif #elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) return static_cast(h.x); #else diff --git a/Eigen/src/Core/arch/GPU/PacketMath.h b/Eigen/src/Core/arch/GPU/PacketMath.h index 689110dede13d0e26eb181e4de03cf3a8bc011c1..bfc11efbca83431f6b3839a06e15796ad0853bea 100644 --- a/Eigen/src/Core/arch/GPU/PacketMath.h +++ b/Eigen/src/Core/arch/GPU/PacketMath.h @@ -121,7 +121,6 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1(const do // invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation // of the functions, while the latter can only deal with one of them. #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) -namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) { @@ -180,8 +179,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull); } -} // namespace - template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand(const float4& a, const float4& b) { @@ -493,9 +490,10 @@ ptranspose(PacketBlock& kernel) { #endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU) -// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning -// its corresponding packet_traits must be visible on host. 
-#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +// Half-packet functions are not available on the host for CUDA 9.0-9.2, only +// on device. There is no benefit to using them on the host anyways, since they are +// emulated. +#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> struct unpacket_traits { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; }; @@ -526,42 +524,9 @@ template<> struct packet_traits : default_packet_traits }; }; -namespace { -// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist. -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __halves2half2(a, b); -#else - // Round-about way since __halves2half2 is a __device__ function. - return __floats2half2_rn(__half2float(a), __half2float(b)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __low2half(a); -#else - return __float2half(__low2float(a)); -#endif -} - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return __high2half(a); -#else - return __float2half(__high2float(a)); -#endif -} -} // namespace - template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1(const Eigen::half& from) { -#if defined(EIGEN_GPU_COMPILE_PHASE) return __half2half2(from); -#else - const float f = __half2float(from); - return __floats2half2_rn(f, f); -#endif } template <> @@ -576,8 +541,6 @@ pset1(const Eigen::half& from) { return r; } -// We now need this visible on both host and device. -// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) namespace { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { @@ -585,11 +548,11 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { - return combine_half(from[0], from[1]); + return __halves2half2(from[0], from[1]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) { - return combine_half(from[0], from[0]); + return __halves2half2(from[0], from[0]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, @@ -599,8 +562,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) { - to[0] = get_half2_low(from); - to[1] = get_half2_high(from); + to[0] = __low2half(from); + to[1] = __high2half(from); } @@ -610,7 +573,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned( // Input is guaranteed to be properly aligned. 
return __ldg(reinterpret_cast(from)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } @@ -619,31 +582,31 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned( #if defined(EIGEN_GPU_HAS_LDG) return __halves2half2(__ldg(from+0), __ldg(from+1)); #else - return combine_half(*(from+0), *(from+1)); + return __halves2half2(*(from+0), *(from+1)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) { - return combine_half(from[0*stride], from[1*stride]); + return __halves2half2(from[0*stride], from[1*stride]); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter( Eigen::half* to, const half2& from, Index stride) { - to[stride*0] = get_half2_low(from); - to[stride*1] = get_half2_high(from); + to[stride*0] = __low2half(from); + to[stride*1] = __high2half(from); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { - return get_half2_low(a); + return __low2half(a); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); + half a1 = __low2half(a); + half a2 = __high2half(a); half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF); half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) { @@ -658,12 +621,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - __half a1 = get_half2_low(kernel.packet[0]); - __half a2 = get_half2_high(kernel.packet[0]); - __half b1 = get_half2_low(kernel.packet[1]); - __half b2 = get_half2_high(kernel.packet[1]); - kernel.packet[0] = combine_half(a1, b1); - kernel.packet[1] = combine_half(a2, b2); + __half a1 = __low2half(kernel.packet[0]); + __half a2 = __high2half(kernel.packet[0]); + __half b1 = __low2half(kernel.packet[1]); + __half b2 = __high2half(kernel.packet[1]); + kernel.packet[0] = __halves2half2(a1, b1); + kernel.packet[1] = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { @@ -671,88 +634,88 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) { return __halves2half2(a, __hadd(a, __float2half(1.0f))); #else float f = __half2float(a) + 1.0f; - return combine_half(a, __float2half(f)); + return __halves2half2(a, __float2half(f)); #endif } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) { - half mask_low = get_half2_low(mask); - half mask_high = get_half2_high(mask); - half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a); - half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a); - return combine_half(result_low, result_high); + half mask_low = __low2half(mask); + half mask_high = __high2half(mask); + half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a); + half result_high = mask_high == half(0) ? 
__high2half(b) : __high2half(a); + return __halves2half2(result_low, result_high); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) { half true_half = half_impl::raw_uint16_to_half(0xffffu); half false_half = half_impl::raw_uint16_to_half(0x0000u); - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half; half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half; - return combine_half(eq1, eq2); + return __halves2half2(eq1, eq2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) { - half a1 = get_half2_low(a); - half a2 = get_half2_high(a); - half b1 = get_half2_low(b); - half b2 = get_half2_high(b); + half a1 = __low2half(a); + half a2 = __high2half(a); + half b1 = __low2half(b); + half b2 = __high2half(b); half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x); half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x); - return combine_half(result1, result2); + return __halves2half2(result1, result2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, @@ -851,9 +814,9 @@ 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, @@ -862,9 +825,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) { @@ -885,7 +848,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 > a2 ? get_half2_low(a) : get_half2_high(a); + return a1 > a2 ? __low2half(a) : __high2half(a); #endif } @@ -897,7 +860,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) { #else float a1 = __low2float(a); float a2 = __high2float(a); - return a1 < a2 ? get_half2_low(a) : get_half2_high(a); + return a1 < a2 ? __low2half(a) : __high2half(a); #endif } @@ -1068,10 +1031,10 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather(const Eigen::half* from, Index stride) { Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(from[0 * stride], from[1 * stride]); - p_alias[1] = combine_half(from[2 * stride], from[3 * stride]); - p_alias[2] = combine_half(from[4 * stride], from[5 * stride]); - p_alias[3] = combine_half(from[6 * stride], from[7 * stride]); + p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]); + p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]); + p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]); + p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]); return r; } @@ -1152,12 +1115,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2( EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) { - __half a1 = get_half2_low(f0); - __half a2 = get_half2_high(f0); - __half b1 = get_half2_low(f1); - __half b2 = get_half2_high(f1); - f0 = combine_half(a1, b1); - f1 = combine_half(a2, b2); + __half a1 = __low2half(f0); + __half a2 = __high2half(f0); + __half b1 = __low2half(f1); + __half b2 = __high2half(f1); + f0 = __halves2half2(a1, b1); + f1 = __halves2half2(a2, b2); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void @@ -1254,10 +1217,10 @@ plset(const Eigen::half& a) { float f = __half2float(a); Packet4h2 r; half2* p_alias = reinterpret_cast(&r); - p_alias[0] = combine_half(a, __float2half(f + 1.0f)); - p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f)); - p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f)); - p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f)); + p_alias[0] = __halves2half2(a, __float2half(f + 1.0f)); + p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f)); + p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f)); + 
p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f)); return r; #endif } @@ -1477,9 +1440,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_max(a_alias[0]), + half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1])); - half2 m1 = combine_half(predux_max(a_alias[2]), + half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3])); __half first = predux_max(m0); __half second = predux_max(m1); @@ -1496,9 +1459,9 @@ template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min( const Packet4h2& a) { const half2* a_alias = reinterpret_cast(&a); - half2 m0 = combine_half(predux_min(a_alias[0]), + half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1])); - half2 m1 = combine_half(predux_min(a_alias[2]), + half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3])); __half first = predux_min(m0); __half second = predux_min(m1); @@ -1652,9 +1615,9 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 < b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 < b2 ? __high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } template<> @@ -1664,14 +1627,12 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, float a2 = __high2float(a); float b1 = __low2float(b); float b2 = __high2float(b); - __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b); - __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b); - return combine_half(r1, r2); + __half r1 = a1 > b1 ? __low2half(a) : __low2half(b); + __half r2 = a2 > b2 ? 
__high2half(a) : __high2half(b); + return __halves2half2(r1, r2); } -// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC) - -#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16) +#endif // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE) #undef EIGEN_GPU_HAS_LDG #undef EIGEN_CUDA_HAS_FP16_ARITHMETIC diff --git a/Eigen/src/Core/arch/GPU/TypeCasting.h b/Eigen/src/Core/arch/GPU/TypeCasting.h index 75454622559cece4eed6303bed8fdb37ab2aecd8..c8195bb2b094633dc03cc71c0144ce43f23a6d66 100644 --- a/Eigen/src/Core/arch/GPU/TypeCasting.h +++ b/Eigen/src/Core/arch/GPU/TypeCasting.h @@ -15,8 +15,7 @@ namespace Eigen { namespace internal { #if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \ - (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) - + (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE)) template <> struct type_casting_traits { diff --git a/Eigen/src/Core/arch/MSA/Complex.h b/Eigen/src/Core/arch/MSA/Complex.h index 4877a95a8d042e827b5809b5cfd94830690b9623..53dacfa43d8932e31c2ba1cf5dac9a09bef3237e 100644 --- a/Eigen/src/Core/arch/MSA/Complex.h +++ b/Eigen/src/Core/arch/MSA/Complex.h @@ -305,42 +305,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet2cf& a (a.v[0] * a.v[3]) + (a.v[1] * a.v[2])); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, - const Packet2cf& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f) template <> @@ -644,42 +608,6 @@ EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& return pfirst(a); } -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(a, pconj(b)); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return internal::pmul(pconj(a), b); - } -}; - -template <> -struct conj_helper { - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, - const Packet1cd& c) const { - return padd(pmul(x, y), c); - } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d) template <> diff --git 
a/Eigen/src/Core/arch/MSA/PacketMath.h b/Eigen/src/Core/arch/MSA/PacketMath.h index f03cf61ffb6980691642730c79655da26c5c44db..afe8f3375ba9bcfcaf208222ac3fd52ab03433d8 100644 --- a/Eigen/src/Core/arch/MSA/PacketMath.h +++ b/Eigen/src/Core/arch/MSA/PacketMath.h @@ -28,10 +28,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 1aa361bc063eaa97542c67d013ad96927b679286..a58f13ca85028efeb1c57057964753176d48c831 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -124,24 +124,17 @@ template<> EIGEN_STRONG_INLINE Packet1cf psub(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub(a.v, b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); -template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) -{ - Packet4f mask = {-0.0f, -0.0f, 0.0f, 0.0f}; - return Packet2cf(padd(a.v, pxor(mask, b.v))); -} - template<> EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) { return Packet1cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); } template<> EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) { - const Packet2ui b = vreinterpret_u32_f32(a.v); + const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v)); return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR()))); } template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { - const Packet4ui b = vreinterpretq_u32_f32(a.v); + const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v)); return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR()))); } @@ -349,67 +342,13 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return s; } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cf pmadd(const Packet1cf& x, const Packet1cf& y, const Packet1cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cf pmul(const Packet1cf& a, const Packet1cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const 
Packet2cf& a, const Packet2cf& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { return pconj(internal::pmul(a,b)); } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf,Packet2f) EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, const Packet1cf& b) { // TODO optimize it for NEON - Packet1cf res = conj_helper().pmul(a,b); + Packet1cf res = pmul(a, pconj(b)); Packet2f s, rev_s; // this computes the norm @@ -421,7 +360,7 @@ template<> EIGEN_STRONG_INLINE Packet1cf pdiv(const Packet1cf& a, con template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a,pconj(b)); Packet4f s, rev_s; // this computes the norm @@ -610,39 +549,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux(const Pack template<> EIGEN_STRONG_INLINE std::complex predux_mul(const Packet1cd& a) { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(a, pconj(b)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return internal::pmul(pconj(a), b); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { return pconj(internal::pmul(a,b)); } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = pmul(b.v, b.v); Packet2d rev_s = preverse(s); diff --git a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h index 3481f337e374c1fc3eb0b9e780ddb02da963a9e5..0963b0f1f0fbd009933fbe38b2327afbba98bf8a 100644 --- a/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h @@ -24,7 +24,7 @@ struct gebp_traits template EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, - Packet4f& c, Packet4f& tmp, + Packet4f& c, Packet4f&, const LaneIdType&) const { acc(a, b, c); } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 9cf4e071210f3ddc09d88985ae33cb724090695c..f6d6d635a605996571fe7df4dc7a5c4ca663d0ad 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -24,10 +24,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #if EIGEN_ARCH_ARM64 #define 
EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 @@ -36,7 +32,7 @@ namespace internal { #endif #endif -#if EIGEN_COMP_MSVC +#if EIGEN_COMP_MSVC_STRICT // In MSVC's arm_neon.h header file, all NEON vector types // are aliases to the same underlying type __n128. @@ -82,11 +78,21 @@ typedef uint32x4_t Packet4ui; typedef int64x2_t Packet2l; typedef uint64x2_t Packet2ul; -#endif // EIGEN_COMP_MSVC +#endif // EIGEN_COMP_MSVC_STRICT + +EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { + float from[4] = {a, b, c, d}; + return vld1q_f32(from); +} + +EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { + float from[2] = {a, b}; + return vld1_f32(from); +} EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask){ const float* a = reinterpret_cast(&m); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3 )), *(a + ((mask >> 6) & 3))); return res; } @@ -99,7 +105,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n, int { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -108,7 +114,7 @@ EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f &m, const Packet4f &n { const float* a = reinterpret_cast(&m); const float* b = reinterpret_cast(&n); - Packet4f res = {*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))}; + Packet4f res = make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3))); return res; } @@ -139,7 +145,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b return shuffle2(a,b,eigen_neon_shuffle_mask(2, 2, 3, 3)); } #define vec4f_duplane(a, p) \ - vdupq_lane_f32(vget_low_f32(a), p) + Packet4f(vdupq_lane_f32(vget_low_f32(a), p)) #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) @@ -150,7 +156,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC // __builtin_prefetch tends to do nothing on ARM64 compilers because the // prefetch instructions there are too detailed for __builtin_prefetch to map // meaningfully to them. 
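
Note on the make_packet2f / make_packet4f helpers introduced above: brace-initializing a NEON vector (as in the replaced `Packet4f res = {...}` lines) is not portable, because under MSVC's arm_neon.h every NEON vector type is an alias of the same __n128 structure and cannot be list-initialized. Routing construction through a plain array and a vld1 load works on every compiler. A minimal standalone sketch of the same pattern (hypothetical helper name, not the Eigen code itself):

    #include <arm_neon.h>

    // Build a float32x4_t from four scalars portably: a plain array is always
    // brace-initializable, and vld1q_f32 turns it into a vector register.
    static inline float32x4_t make_f32x4(float a, float b, float c, float d) {
      const float from[4] = {a, b, c, d};
      return vld1q_f32(from);
    }

make_packet2f is the two-lane analogue of this, built on vld1_f32.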
@@ -159,7 +165,7 @@ EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b)
 #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
 #elif defined __pld
 #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif EIGEN_ARCH_ARM32
+#elif EIGEN_ARCH_ARM
 #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : );
 #else
 // by default no explicit prefetching
@@ -866,12 +872,12 @@ template<> EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b)
 
 template<> EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
 template<> EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b)
 {
-  Packet2f mask = {-0.0f, 0.0f};
+  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
 template<> EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b)
 {
-  Packet4f mask = {-0.0f, 0.0f, -0.0f, 0.0f};
+  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
   return padd(a, pxor(mask, b));
 }
@@ -2503,7 +2509,7 @@ template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4<Packet4us>(const Packet8us& a)
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a)
 { return vget_lane_f32(a, 0) * vget_lane_f32(a, 1); }
 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{ return predux_mul(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
+{ return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a))); }
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a)
 {
   int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
@@ -2517,7 +2523,7 @@ template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a)
   return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
 }
 template<> EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a)
-{ return predux_mul(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
+{ return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a))); }
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a)
 {
   uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
@@ -2531,7 +2537,7 @@ template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a)
   return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
 }
 template<> EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a)
-{ return predux_mul(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
+{ return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a))); }
 template<> EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a)
 {
   const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
@@ -2567,11 +2573,11 @@ template<> EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a)
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a)
 { return vget_lane_s32(a, 0) * vget_lane_s32(a, 1); }
 template<> EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a)
-{ return predux_mul(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
+{ return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a))); }
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a)
 { return vget_lane_u32(a, 0) * vget_lane_u32(a, 1); }
 template<> EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a)
-{ return predux_mul(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
+{ return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a))); }
 template<> EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a)
 { return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1); }
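
The paddsub masks above are rebuilt from an integer bit pattern because the literal -0.0f is fragile: under -ffast-math or -fno-signed-zeros a compiler may fold -0.0f into +0.0f, silently turning the sign-flip mask into all-zero bits. numext::bit_cast pins the exact 0x80000000 pattern regardless of floating-point flags. A minimal sketch of the idea, assuming C++20 std::bit_cast in place of Eigen's numext::bit_cast:

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // Materialize the sign-bit-only float from its exact bit pattern.
      const std::uint32_t sign_bit = 0x80000000u;
      const float mask = std::bit_cast<float>(sign_bit);  // numerically -0.0f
      // XOR with the mask flips the sign; this is how paddsub negates odd lanes.
      const float x = 3.5f;
      const float y = std::bit_cast<float>(std::bit_cast<std::uint32_t>(x) ^ sign_bit);
      std::printf("%g -> %g\n", x, y);  // prints "3.5 -> -3.5"
      return 0;
    }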
template<> EIGEN_STRONG_INLINE uint64_t predux_mul(const Packet2ul& a) @@ -2774,352 +2780,265 @@ template<> EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) return vget_lane_u32(vpmax_u32(tmp, tmp), 0); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const float32x2x2_t z = vzip_f32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +// Helpers for ptranspose. +namespace detail { + +template +void zip_in_place(Packet& p1, Packet& p2); + +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2f& p1, Packet2f& p2) { + const float32x2x2_t tmp = vzip_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const float32x4x2_t tmp1 = vzipq_f32(kernel.packet[0], kernel.packet[1]); - const float32x4x2_t tmp2 = vzipq_f32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_f32(vget_low_f32(tmp1.val[0]), vget_low_f32(tmp2.val[0])); - kernel.packet[1] = vcombine_f32(vget_high_f32(tmp1.val[0]), vget_high_f32(tmp2.val[0])); - kernel.packet[2] = vcombine_f32(vget_low_f32(tmp1.val[1]), vget_low_f32(tmp2.val[1])); - kernel.packet[3] = vcombine_f32(vget_high_f32(tmp1.val[1]), vget_high_f32(tmp2.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4f& p1, Packet4f& p2) { + const float32x4x2_t tmp = vzipq_f32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); - const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - const int8x8x2_t zip8 = vzip_s8(a,b); - const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8c& p1, Packet8c& p2) { + const int8x8x2_t tmp = vzip_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16c& p1, Packet16c& p2) { + const int8x16x2_t tmp = vzipq_s8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - int8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_s8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8uc& p1, Packet8uc& p2) { + const uint8x8x2_t tmp = vzip_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_s8(zip8[i*2].val[j]), vreinterpret_u16_s8(zip8[i*2+1].val[j])); - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet16uc& p1, Packet16uc& p2) { + const uint8x16x2_t tmp = vzipq_u8(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - 
EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_s8_u32(z.val[k]); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2i& p1, Packet2i& p2) { + const int32x2x2_t tmp = vzip_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - int8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_s8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4i& p1, Packet4i& p2) { + const int32x4x2_t tmp = vzipq_s32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_s8(zip8[i*2].val[j]), - vreinterpretq_u16_s8(zip8[i*2+1].val[j])); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet2ui& p1, Packet2ui& p2) { + const uint32x2x2_t tmp = vzip_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - zip32[i*4+j*2+k] = vzipq_u32(vreinterpretq_u32_u16(zip16[i*4+j].val[k]), - vreinterpretq_u32_u16(zip16[i*4+j+2].val[k])); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4ui& p1, Packet4ui& p2) { + const uint32x4x2_t tmp = vzipq_u32(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - kernel.packet[i*4+j*2] = vreinterpretq_s8_u32(vcombine_u32(vget_low_u32(zip32[i].val[j]), - vget_low_u32(zip32[i+4].val[j]))); - kernel.packet[i*4+j*2+1] = vreinterpretq_s8_u32(vcombine_u32(vget_high_u32(zip32[i].val[j]), - vget_high_u32(zip32[i+4].val[j]))); - } - } +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4s& p1, Packet4s& p2) { + const int16x4x2_t tmp = vzip_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); - const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - const uint8x8x2_t zip8 = vzip_u8(a,b); - const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8s& p1, Packet8s& p2) { + const int16x8x2_t tmp = vzipq_s16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); - kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); - kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); - kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet4us& p1, Packet4us& p2) { + const uint16x4x2_t tmp = vzip_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - uint8x8x2_t zip8[4]; - uint16x4x2_t zip16[4]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - zip8[i] = vzip_u8(kernel.packet[i*2], kernel.packet[i*2+1]); +template<> +EIGEN_ALWAYS_INLINE void zip_in_place(Packet8us& p1, Packet8us& p2) { + const uint16x8x2_t tmp = 
vzipq_u16(p1, p2); + p1 = tmp.val[0]; + p2 = tmp.val[1]; +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzip_u16(vreinterpret_u16_u8(zip8[i*2].val[j]), vreinterpret_u16_u8(zip8[i*2+1].val[j])); - } +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[1]); +} - EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { - const uint32x2x2_t z = vzip_u32(vreinterpret_u32_u16(zip16[i].val[j]), vreinterpret_u32_u16(zip16[i+2].val[j])); - EIGEN_UNROLL_LOOP - for (int k = 0; k != 2; k++) - kernel.packet[i*4+j*2+k] = vreinterpret_u8_u32(z.val[k]); - } - } +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - uint8x16x2_t zip8[8]; - uint16x8x2_t zip16[8]; - uint32x4x2_t zip32[8]; - EIGEN_UNROLL_LOOP - for (int i = 0; i != 8; i++) - zip8[i] = vzipq_u8(kernel.packet[i*2], kernel.packet[i*2+1]); +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { + zip_in_place(kernel.packet[0], kernel.packet[4]); + zip_in_place(kernel.packet[1], kernel.packet[5]); + zip_in_place(kernel.packet[2], kernel.packet[6]); + zip_in_place(kernel.packet[3], kernel.packet[7]); - EIGEN_UNROLL_LOOP - for (int i = 0; i != 4; i++) - { - EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - zip16[i*2+j] = vzipq_u16(vreinterpretq_u16_u8(zip8[i*2].val[j]), - vreinterpretq_u16_u8(zip8[i*2+1].val[j])); - } + zip_in_place(kernel.packet[0], kernel.packet[2]); + zip_in_place(kernel.packet[1], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[6]); + zip_in_place(kernel.packet[5], kernel.packet[7]); + + zip_in_place(kernel.packet[0], kernel.packet[1]); + zip_in_place(kernel.packet[2], kernel.packet[3]); + zip_in_place(kernel.packet[4], kernel.packet[5]); + zip_in_place(kernel.packet[6], kernel.packet[7]); +} +template +EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock& kernel) { EIGEN_UNROLL_LOOP - for (int i = 0; i != 2; i++) - { + for (int i=0; i<4; ++i) { + const int m = (1 << i); EIGEN_UNROLL_LOOP - for (int j = 0; j != 2; j++) - { + for (int j=0; j& kernel) -{ - const int16x4x2_t zip16_1 = vzip_s16(kernel.packet[0], kernel.packet[1]); - const int16x4x2_t zip16_2 = vzip_s16(kernel.packet[2], kernel.packet[3]); - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[0]), vreinterpret_u32_s16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_s16(zip16_1.val[1]), vreinterpret_u32_s16(zip16_2.val[1])); +} // namespace detail - kernel.packet[0] = vreinterpret_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_s16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = vzipq_s16(kernel.packet[2], kernel.packet[3]); - - const uint32x4x2_t zip32_1 = 
vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpretq_s16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpretq_s16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpretq_s16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpretq_s16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - const int8x16x2_t zip8_1 = vzipq_s8(kernel.packet[0], kernel.packet[1]); - const int8x16x2_t zip8_2 = vzipq_s8(kernel.packet[2], kernel.packet[3]); + const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1)); + const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1)); - const int16x8x2_t zip16_1 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[0]), vreinterpretq_s16_s8(zip8_2.val[0])); - const int16x8x2_t zip16_2 = vzipq_s16(vreinterpretq_s16_s8(zip8_1.val[1]), vreinterpretq_s16_s8(zip8_2.val[1])); + const int8x8x2_t zip8 = vzip_s8(a,b); + const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1])); - kernel.packet[0] = vreinterpretq_s8_s16(zip16_1.val[0]); - kernel.packet[1] = vreinterpretq_s8_s16(zip16_1.val[1]); - kernel.packet[2] = vreinterpretq_s8_s16(zip16_2.val[0]); - kernel.packet[3] = vreinterpretq_s8_s16(zip16_2.val[1]); + kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1); } - -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint8x16x2_t zip8_1 = vzipq_u8(kernel.packet[0], kernel.packet[1]); - const uint8x16x2_t zip8_2 = vzipq_u8(kernel.packet[2], kernel.packet[3]); - - const uint16x8x2_t zip16_1 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[0]), vreinterpretq_u16_u8(zip8_2.val[0])); - const uint16x8x2_t zip16_2 = vzipq_u16(vreinterpretq_u16_u8(zip8_1.val[1]), vreinterpretq_u16_u8(zip8_2.val[1])); - - kernel.packet[0] = vreinterpretq_u8_u16(zip16_1.val[0]); - kernel.packet[1] = vreinterpretq_u8_u16(zip16_1.val[1]); - kernel.packet[2] = vreinterpretq_u8_u16(zip16_2.val[0]); - kernel.packet[3] = vreinterpretq_u8_u16(zip16_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - const int16x8x2_t zip16_1 = vzipq_s16(kernel.packet[0], kernel.packet[1]); - const int16x8x2_t zip16_2 = 
vzipq_s16(kernel.packet[2], kernel.packet[3]); - const int16x8x2_t zip16_3 = vzipq_s16(kernel.packet[4], kernel.packet[5]); - const int16x8x2_t zip16_4 = vzipq_s16(kernel.packet[6], kernel.packet[7]); + const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1)); + const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1)); - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[0]), vreinterpretq_u32_s16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_s16(zip16_1.val[1]), vreinterpretq_u32_s16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[0]), vreinterpretq_u32_s16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_s16(zip16_3.val[1]), vreinterpretq_u32_s16(zip16_4.val[1])); + const uint8x8x2_t zip8 = vzip_u8(a,b); + const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1])); - kernel.packet[0] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_s16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_s16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); + kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0); + kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1); + kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0); + kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint16x4x2_t zip16_1 = vzip_u16(kernel.packet[0], kernel.packet[1]); - const uint16x4x2_t zip16_2 = vzip_u16(kernel.packet[2], kernel.packet[3]); - - const uint32x2x2_t zip32_1 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[0]), vreinterpret_u32_u16(zip16_2.val[0])); - const uint32x2x2_t zip32_2 = vzip_u32(vreinterpret_u32_u16(zip16_1.val[1]), vreinterpret_u32_u16(zip16_2.val[1])); - - kernel.packet[0] = vreinterpret_u16_u32(zip32_1.val[0]); - kernel.packet[1] = vreinterpret_u16_u32(zip32_1.val[1]); - kernel.packet[2] = vreinterpret_u16_u32(zip32_2.val[0]); - kernel.packet[3] = vreinterpret_u16_u32(zip32_2.val[1]); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + 
detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint16x8x2_t zip16_1 = vzipq_u16(kernel.packet[0], kernel.packet[1]); - const uint16x8x2_t zip16_2 = vzipq_u16(kernel.packet[2], kernel.packet[3]); - const uint16x8x2_t zip16_3 = vzipq_u16(kernel.packet[4], kernel.packet[5]); - const uint16x8x2_t zip16_4 = vzipq_u16(kernel.packet[6], kernel.packet[7]); - - const uint32x4x2_t zip32_1 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[0]), vreinterpretq_u32_u16(zip16_2.val[0])); - const uint32x4x2_t zip32_2 = vzipq_u32(vreinterpretq_u32_u16(zip16_1.val[1]), vreinterpretq_u32_u16(zip16_2.val[1])); - const uint32x4x2_t zip32_3 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[0]), vreinterpretq_u32_u16(zip16_4.val[0])); - const uint32x4x2_t zip32_4 = vzipq_u32(vreinterpretq_u32_u16(zip16_3.val[1]), vreinterpretq_u32_u16(zip16_4.val[1])); - kernel.packet[0] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[0]), vget_low_u32(zip32_3.val[0]))); - kernel.packet[1] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[0]), vget_high_u32(zip32_3.val[0]))); - kernel.packet[2] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_1.val[1]), vget_low_u32(zip32_3.val[1]))); - kernel.packet[3] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_1.val[1]), vget_high_u32(zip32_3.val[1]))); - kernel.packet[4] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[0]), vget_low_u32(zip32_4.val[0]))); - kernel.packet[5] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[0]), vget_high_u32(zip32_4.val[0]))); - kernel.packet[6] = vreinterpretq_u16_u32(vcombine_u32(vget_low_u32(zip32_2.val[1]), vget_low_u32(zip32_4.val[1]))); - kernel.packet[7] = vreinterpretq_u16_u32(vcombine_u32(vget_high_u32(zip32_2.val[1]), vget_high_u32(zip32_4.val[1]))); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int32x2x2_t z = vzip_s32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const int32x4x2_t tmp1 = vzipq_s32(kernel.packet[0], kernel.packet[1]); - const int32x4x2_t tmp2 = vzipq_s32(kernel.packet[2], kernel.packet[3]); - kernel.packet[0] = vcombine_s32(vget_low_s32(tmp1.val[0]), vget_low_s32(tmp2.val[0])); - kernel.packet[1] = vcombine_s32(vget_high_s32(tmp1.val[0]), vget_high_s32(tmp2.val[0])); - kernel.packet[2] = vcombine_s32(vget_low_s32(tmp1.val[1]), vget_low_s32(tmp2.val[1])); - kernel.packet[3] = vcombine_s32(vget_high_s32(tmp1.val[1]), vget_high_s32(tmp2.val[1])); +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); } -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) -{ - const uint32x2x2_t z = vzip_u32(kernel.packet[0], kernel.packet[1]); - kernel.packet[0] = z.val[0]; - kernel.packet[1] = z.val[1]; +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + detail::ptranspose_impl(kernel); +} +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { + 
detail::ptranspose_impl(kernel);
+}
-EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel)
-{
-  const uint32x4x2_t tmp1 = vzipq_u32(kernel.packet[0], kernel.packet[1]);
-  const uint32x4x2_t tmp2 = vzipq_u32(kernel.packet[2], kernel.packet[3]);
-  kernel.packet[0] = vcombine_u32(vget_low_u32(tmp1.val[0]), vget_low_u32(tmp2.val[0]));
-  kernel.packet[1] = vcombine_u32(vget_high_u32(tmp1.val[0]), vget_high_u32(tmp2.val[0]));
-  kernel.packet[2] = vcombine_u32(vget_low_u32(tmp1.val[1]), vget_low_u32(tmp2.val[1]));
-  kernel.packet[3] = vcombine_u32(vget_high_u32(tmp1.val[1]), vget_high_u32(tmp2.val[1]));
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel)
 {
 #if EIGEN_ARCH_ARM64
   const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
-  const int64x2_t tmp2 = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
-
+  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
 #else
   const int64x1_t tmp[2][2] = {
     { vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0]) },
@@ -3135,10 +3054,8 @@ ptranspose(PacketBlock<Packet2ul, 2>& kernel)
 {
 #if EIGEN_ARCH_ARM64
   const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
-  const uint64x2_t tmp2 = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
-
+  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
   kernel.packet[0] = tmp1;
-  kernel.packet[1] = tmp2;
 #else
   const uint64x1_t tmp[2][2] = {
     { vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0]) },
@@ -3468,11 +3385,20 @@ template<> struct unpacket_traits<Packet4bf>
   };
 };
 
+namespace detail {
+template<>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+} // namespace detail
+
 EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
 {
   // See the scalar implementation in BFloat16.h for a comprehensible explanation
   // of this fast rounding algorithm
-  Packet4ui input = reinterpret_cast<Packet4ui>(p);
+  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
 
   // lsb = (input >> 16) & 1
   Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
@@ -3497,7 +3423,7 @@ EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p)
 
 EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p)
 {
-  return reinterpret_cast<Packet4f>(vshlq_n_u32(vmovl_u16(p), 16));
+  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
 }
 
 EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
@@ -3505,21 +3431,21 @@ EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) {
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
-  return pset1<Packet4us>(from.value);
+  return Packet4bf(pset1<Packet4us>(from.value));
 }
 
 template<> EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
-  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(from)));
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
 }
 
 template<> EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const
bfloat16* from) { - return pload(reinterpret_cast(from)); + return Packet4bf(pload(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { - return ploadu(reinterpret_cast(from)); + return Packet4bf(ploadu(reinterpret_cast(from))); } template<> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) @@ -3534,7 +3460,7 @@ template<> EIGEN_STRONG_INLINE void pstoreu(bfloat16* to, const Packet template<> EIGEN_STRONG_INLINE Packet4bf ploaddup(const bfloat16* from) { - return ploaddup(reinterpret_cast(from)); + return Packet4bf(ploaddup(reinterpret_cast(from))); } template <> EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) { @@ -3581,25 +3507,25 @@ template<> EIGEN_STRONG_INLINE Packet4bf plset(const bfloat16& a) } template<> EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a,const Packet4bf& b) { - return por(a, b); + return Packet4bf(por(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a,const Packet4bf& b) { - return pxor(a, b); + return Packet4bf(pxor(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a,const Packet4bf& b) { - return pand(a, b); + return Packet4bf(pand(Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a,const Packet4bf& b) { - return pandnot(a, b); + return Packet4bf(pandnot(Packet4us(a), Packet4us(b))); } template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) { - return pselect(mask, a, b); + return Packet4bf(pselect(Packet4us(mask), Packet4us(a), Packet4us(b))); } template<> EIGEN_STRONG_INLINE Packet4bf print(const Packet4bf& a) @@ -3638,13 +3564,13 @@ template<> EIGEN_STRONG_INLINE Packet4bf pdiv(const Packet4bf& a, con template<> EIGEN_STRONG_INLINE Packet4bf pgather(const bfloat16* from, Index stride) { - return pgather(reinterpret_cast(from), stride); + return Packet4bf(pgather(reinterpret_cast(from), stride)); } template<> EIGEN_STRONG_INLINE void pscatter(bfloat16* to, const Packet4bf& from, Index stride) { - pscatter(reinterpret_cast(to), from, stride); + pscatter(reinterpret_cast(to), Packet4us(from), stride); } template<> EIGEN_STRONG_INLINE bfloat16 predux(const Packet4bf& a) @@ -3669,21 +3595,12 @@ template<> EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet4bf& a template<> EIGEN_STRONG_INLINE Packet4bf preverse(const Packet4bf& a) { - return preverse(a); + return Packet4bf(preverse(Packet4us(a))); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel) { - PacketBlock k; - k.packet[0] = kernel.packet[0]; - k.packet[1] = kernel.packet[1]; - k.packet[2] = kernel.packet[2]; - k.packet[3] = kernel.packet[3]; - ptranspose(k); - kernel.packet[0] = k.packet[0]; - kernel.packet[1] = k.packet[1]; - kernel.packet[2] = k.packet[2]; - kernel.packet[3] = k.packet[3]; + detail::ptranspose_impl(kernel); } template<> EIGEN_STRONG_INLINE Packet4bf pabsdiff(const Packet4bf& a, const Packet4bf& b) @@ -3701,6 +3618,11 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt(const Packet4bf& a, return F32MaskToBf16Mask(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b))); } +template<> EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan(const Packet4bf& a, const Packet4bf& b) +{ + return F32MaskToBf16Mask(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b))); +} + template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b) { return F32MaskToBf16Mask(pcmp_le(Bf16ToF32(a), Bf16ToF32(b))); @@ 
-3708,7 +3630,7 @@ template<> EIGEN_STRONG_INLINE Packet4bf pcmp_le(const Packet4bf& a, const Packet4bf& b)
 
 template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a)
 {
-  return pxor(a, pset1<Packet4bf>(static_cast<uint16_t>(0x8000)));
+  return Packet4bf(pxor(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
 }
 
 //---------- double ----------
@@ -3726,16 +3648,29 @@ template<> EIGEN_STRONG_INLINE Packet4bf pnegate(const Packet4bf& a)
 
 #if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
+#if EIGEN_COMP_GNUC
 // Bug 907: workaround missing declarations of the following two functions in the ADK
 // Defining these functions as templates ensures that if these intrinsics are
 // already defined in arm_neon.h, then our workaround doesn't cause a conflict
 // and has lower priority in overload resolution.
+// This doesn't work with MSVC though, since the function names are macros.
 template <typename T> uint64x2_t vreinterpretq_u64_f64(T a) { return (uint64x2_t) a; }
 template <typename T> float64x2_t vreinterpretq_f64_u64(T a) { return (float64x2_t) a; }
+#endif
 
+#if EIGEN_COMP_MSVC_STRICT
+typedef eigen_packet_wrapper<float64x2_t, 0> Packet2d;
+typedef eigen_packet_wrapper<float64x1_t, 1> Packet1d;
+#else
 typedef float64x2_t Packet2d;
 typedef float64x1_t Packet1d;
+#endif
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  double from[2] = {a, b};
+  return vld1q_f64(from);
+}
 
 // functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
 // Currently used in LU/arch/InverseSize4.h to enable a shared implementation
@@ -3744,7 +3679,7 @@ EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask)
 {
   const double* a = reinterpret_cast<const double*>(&m);
   const double* b = reinterpret_cast<const double*>(&n);
-  Packet2d res = {*(a + (mask & 1)), *(b + ((mask >> 1) & 1))};
+  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
   return res;
 }
@@ -3761,7 +3696,7 @@ EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a,const Packet2d& b)
   return shuffle(a, b, 3);
 }
 #define vec2d_duplane(a, p) \
-  vdupq_laneq_f64(a, p)
+  Packet2d(vdupq_laneq_f64(a, p))
 
 template<> struct packet_traits<double> : default_packet_traits
 {
@@ -3835,7 +3770,7 @@ template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b)
 
 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& , const Packet2d& );
 template<> EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b){
-  const Packet2d mask = {-0.0,0.0};
+  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
   return padd(a, pxor(mask, b));
 }
@@ -3950,7 +3885,7 @@ template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
 { return (vget_low_f64(a) * vget_high_f64(a))[0]; }
 #else
 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{ return vget_lane_f64(vget_low_f64(a) * vget_high_f64(a), 0); }
+{ return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0); }
 #endif
 
 // min
@@ -4006,7 +3941,7 @@ template<> EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
 template<> EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x){ return vsqrtq_f64(_x); }
 
-#endif // EIGEN_ARCH_ARM64
+#endif // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
 
 // Do we have fp16 types and supporting Neon intrinsics?
#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC diff --git a/Eigen/src/Core/arch/NEON/TypeCasting.h b/Eigen/src/Core/arch/NEON/TypeCasting.h index 54f97336e03414097ebc4e5eaaa0e5545fed7276..c546466a137b003902070aa83ee1384e9bc7fb54 100644 --- a/Eigen/src/Core/arch/NEON/TypeCasting.h +++ b/Eigen/src/Core/arch/NEON/TypeCasting.h @@ -15,6 +15,113 @@ namespace Eigen { namespace internal { +//============================================================================== +// preinterpret +//============================================================================== +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { + return Packet2f(vreinterpret_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { + return Packet2f(vreinterpret_f32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { + return Packet4f(vreinterpretq_f32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { + return Packet4f(vreinterpretq_f32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { + return Packet8c(vreinterpret_s8_u8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { + return Packet16c(vreinterpretq_s8_u8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { + return static_cast(a); +} +template <> +EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { + return Packet8uc(vreinterpret_u8_s8(a)); +} +template <> +EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { + return Packet16uc(vreinterpretq_u8_s8(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { + return Packet4s(vreinterpret_s16_u16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { + return Packet8s(vreinterpretq_s16_u16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { + return Packet4us(vreinterpret_u16_s16(a)); +} +template <> +EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { + return Packet8us(vreinterpretq_u16_s16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { + return Packet2i(vreinterpret_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { + return Packet2i(vreinterpret_s32_u32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { + return Packet4i(vreinterpretq_s32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { + return Packet4i(vreinterpretq_s32_u32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { + return Packet2ui(vreinterpret_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { + return Packet2ui(vreinterpret_u32_s32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { + return Packet4ui(vreinterpretq_u32_f32(a)); +} +template <> +EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { + return Packet4ui(vreinterpretq_u32_s32(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { + return Packet2l(vreinterpretq_s64_u64(a)); +} +template <> +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { + return Packet2ul(vreinterpretq_u64_s64(a)); +} + 
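
The preinterpret specializations above were moved ahead of the pcast definitions so the pcast bodies below can reuse them. The two operations are easy to conflate: preinterpret is a bit-level retyping (a vreinterpret, which costs no instructions), while pcast is a value conversion (a vcvt or a widening/narrowing move). A minimal sketch of the distinction in raw NEON intrinsics (illustrative only):

    #include <arm_neon.h>

    void demo(float32x4_t f) {
      // preinterpret: same 128 bits, new static type; compiles to nothing.
      int32x4_t bits = vreinterpretq_s32_f32(f);
      // pcast: converts each float lane to an int32 value, truncating toward zero.
      int32x4_t vals = vcvtq_s32_f32(f);
      (void)bits;
      (void)vals;
    }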
//============================================================================== // pcast, SrcType = float //============================================================================== @@ -188,7 +295,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet16c& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -212,11 +319,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet16c& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet8c& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -240,11 +347,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet16c& a) { - return vreinterpretq_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet8c& a) { - return vreinterpret_u16_s16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -270,11 +377,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16uc pcast(const Packet16c& a) { - return vreinterpretq_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8uc pcast(const Packet8c& a) { - return vreinterpret_u8_s8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4uc pcast(const Packet4c& a) { @@ -315,7 +422,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet16uc& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -339,11 +446,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet16uc& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet8uc& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -367,11 +474,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet16uc& a) { - return vreinterpretq_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet8uc& a) { - return vreinterpret_s16_u16(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -397,11 +504,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet8uc& a) { - return vreinterpret_s8_u8(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4c pcast(const Packet4uc& a) { @@ -442,7 +549,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet8s& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -466,11 +573,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet8s& a) { - return vreinterpretq_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet4s& a) { - return vreinterpret_u32_s32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -492,11 +599,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8us pcast(const Packet8s& a) { - return vreinterpretq_u16_s16(a); + return 
preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4us pcast(const Packet4s& a) { - return vreinterpret_u16_s16(a); + return preinterpret(a); } template <> @@ -559,7 +666,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet8us& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -583,11 +690,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet8us& a) { - return vreinterpretq_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet4us& a) { - return vreinterpret_s32_u32(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -609,11 +716,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet8us& a) { - return vreinterpretq_s16_u16(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet4us& a) { - return vreinterpret_s16_u16(a); + return preinterpret(a); } template <> @@ -635,11 +742,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet8us& a, const Packet8us& b) { - return vreinterpretq_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet4us& a, const Packet4us& b) { - return vreinterpret_s8_u8(pcast(a, b)); + return preinterpret(pcast(a, b)); } //============================================================================== @@ -674,7 +781,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet4i& a) { - return vreinterpretq_u64_s64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -696,11 +803,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4ui pcast(const Packet4i& a) { - return vreinterpretq_u32_s32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2ui pcast(const Packet2i& a) { - return vreinterpret_u32_s32(a); + return preinterpret(a); } template <> @@ -799,7 +906,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet4ui& a) { - return vreinterpretq_s64_u64(pcast(a)); + return preinterpret(pcast(a)); } template <> @@ -821,11 +928,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); + return preinterpret(a); } template <> EIGEN_STRONG_INLINE Packet2i pcast(const Packet2ui& a) { - return vreinterpret_s32_u32(a); + return preinterpret(a); } template <> @@ -847,11 +954,11 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet4ui& a, const Packet4ui& b) { - return vreinterpretq_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> EIGEN_STRONG_INLINE Packet4s pcast(const Packet2ui& a, const Packet2ui& b) { - return vreinterpret_s16_u16(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -880,12 +987,12 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c, const Packet4ui& d) { - return vreinterpretq_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> EIGEN_STRONG_INLINE Packet8c pcast(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c, const Packet2ui& d) { - return vreinterpret_s8_u8(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } 
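
All of the rewrites above follow one factoring: only the signed conversion chain is implemented with real instructions, and the unsigned counterpart is obtained by a preinterpret of its result, so each signed/unsigned pair shares a single conversion path. A sketch of that shape in raw intrinsics, converting the first four lanes of an int8x16_t to uint32 (hypothetical helper, not the Eigen function itself):

    #include <arm_neon.h>

    uint32x4_t cast_s8_to_u32_first4(int8x16_t a) {
      int16x8_t w16 = vmovl_s8(vget_low_s8(a));      // sign-extend 8 -> 16 bit
      int32x4_t w32 = vmovl_s16(vget_low_s16(w16));  // sign-extend 16 -> 32 bit
      return vreinterpretq_u32_s32(w32);             // free bit-level retype
    }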
//============================================================================== @@ -915,7 +1022,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2ul pcast(const Packet2l& a) { - return vreinterpretq_u64_s64(a); + return preinterpret(a); } template <> @@ -1013,7 +1120,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet2l pcast(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); + return preinterpret(a); } template <> @@ -1031,7 +1138,7 @@ struct type_casting_traits { }; template <> EIGEN_STRONG_INLINE Packet4i pcast(const Packet2ul& a, const Packet2ul& b) { - return vreinterpretq_s32_u32(pcast(a, b)); + return preinterpret(pcast(a, b)); } template <> @@ -1053,7 +1160,7 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet8s pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d) { - return vreinterpretq_s16_u16(pcast(a, b, c, d)); + return preinterpret(pcast(a, b, c, d)); } template <> @@ -1077,122 +1184,40 @@ template <> EIGEN_STRONG_INLINE Packet16c pcast(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c, const Packet2ul& d, const Packet2ul& e, const Packet2ul& f, const Packet2ul& g, const Packet2ul& h) { - return vreinterpretq_s8_u8(pcast(a, b, c, d, e, f, g, h)); + return preinterpret(pcast(a, b, c, d, e, f, g, h)); } +#if EIGEN_ARCH_ARM64 + //============================================================================== -// preinterpret +// pcast/preinterpret, Double //============================================================================== -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2i& a) { - return vreinterpret_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2f preinterpret(const Packet2ui& a) { - return vreinterpret_f32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4i& a) { - return vreinterpretq_f32_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4f preinterpret(const Packet4ui& a) { - return vreinterpretq_f32_u32(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4c preinterpret(const Packet4uc& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8c preinterpret(const Packet8uc& a) { - return vreinterpret_s8_u8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16c preinterpret(const Packet16uc& a) { - return vreinterpretq_s8_u8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4uc preinterpret(const Packet4c& a) { - return static_cast(a); -} -template <> -EIGEN_STRONG_INLINE Packet8uc preinterpret(const Packet8c& a) { - return vreinterpret_u8_s8(a); -} -template <> -EIGEN_STRONG_INLINE Packet16uc preinterpret(const Packet16c& a) { - return vreinterpretq_u8_s8(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4s preinterpret(const Packet4us& a) { - return vreinterpret_s16_u16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8s preinterpret(const Packet8us& a) { - return vreinterpretq_s16_u16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet4us preinterpret(const Packet4s& a) { - return vreinterpret_u16_s16(a); -} -template <> -EIGEN_STRONG_INLINE Packet8us preinterpret(const Packet8s& a) { - return vreinterpretq_u16_s16(a); -} - -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2f& a) { - return vreinterpret_s32_f32(a); -} -template <> -EIGEN_STRONG_INLINE Packet2i preinterpret(const Packet2ui& a) { - return vreinterpret_s32_u32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4f& a) { - return vreinterpretq_s32_f32(a); -} -template <> 
-EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet4ui& a) { - return vreinterpretq_s32_u32(a); -} template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2f& a) { - return vreinterpret_u32_f32(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { + return Packet2d(vreinterpretq_f64_s64(a)); } template <> -EIGEN_STRONG_INLINE Packet2ui preinterpret(const Packet2i& a) { - return vreinterpret_u32_s32(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { + return Packet2d(vreinterpretq_f64_u64(a)); } template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4f& a) { - return vreinterpretq_u32_f32(a); +EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { + return Packet2l(vreinterpretq_s64_f64(a)); } template <> -EIGEN_STRONG_INLINE Packet4ui preinterpret(const Packet4i& a) { - return vreinterpretq_u32_s32(a); +EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { + return Packet2ul(vreinterpretq_u64_f64(a)); } - template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2ul& a) { - return vreinterpretq_s64_u64(a); +EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { + return Packet2d(vreinterpretq_f64_s32(a)); } template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2l& a) { - return vreinterpretq_u64_s64(a); +EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { + return Packet4i(vreinterpretq_s32_f64(a)); } -#if EIGEN_ARCH_ARM64 - -//============================================================================== -// pcast/preinterpret, Double -//============================================================================== - template <> struct type_casting_traits { enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; @@ -1314,7 +1339,9 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16c& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s8(a))); + // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability. + Packet2f tmp = pcast(vget_low_s8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1324,7 +1351,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet16uc& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_u8(a))); + Packet2f tmp = pcast(vget_low_u8(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1334,7 +1362,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8s& a) { // Discard all but first two values. - return vcvt_f64_f32(pcast(vget_low_s16(a))); + Packet2f tmp = pcast(vget_low_s16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1344,7 +1373,8 @@ struct type_casting_traits { template <> EIGEN_STRONG_INLINE Packet2d pcast(const Packet8us& a) { // Discard all but first two values. 
- return vcvt_f64_f32(pcast(vget_low_u16(a))); + Packet2f tmp = pcast(vget_low_u16(a)); + return vcvt_f64_f32(tmp); } template <> @@ -1385,31 +1415,6 @@ EIGEN_STRONG_INLINE Packet2d pcast(const Packet2ul& a) { return vcvtq_f64_u64(a); } -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2l& a) { - return vreinterpretq_f64_s64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet2ul& a) { - return vreinterpretq_f64_u64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2l preinterpret(const Packet2d& a) { - return vreinterpretq_s64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2ul preinterpret(const Packet2d& a) { - return vreinterpretq_u64_f64(a); -} -template <> -EIGEN_STRONG_INLINE Packet2d preinterpret(const Packet4i& a) { - return vreinterpretq_f64_s32(a); -} -template <> -EIGEN_STRONG_INLINE Packet4i preinterpret(const Packet2d& a) { - return vreinterpretq_s32_f64(a); -} - #endif // EIGEN_ARCH_ARM64 } // end namespace internal diff --git a/Eigen/src/Core/arch/SSE/Complex.h b/Eigen/src/Core/arch/SSE/Complex.h index b1edfa4b23f3acc33c968295aaddd7daa07c9554..215bfd7bb6a294690e9e1a9e640d47c7126619a3 100644 --- a/Eigen/src/Core/arch/SSE/Complex.h +++ b/Eigen/src/Core/arch/SSE/Complex.h @@ -19,7 +19,7 @@ struct Packet2cf { EIGEN_STRONG_INLINE Packet2cf() {} EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {} - __m128 v; + Packet4f v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -66,12 +66,6 @@ template<> struct unpacket_traits { template<> EIGEN_STRONG_INLINE Packet2cf padd(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); } template<> EIGEN_STRONG_INLINE Packet2cf psub(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); } -template<> EIGEN_STRONG_INLINE Packet2cf pxor(const Packet2cf& a, const Packet2cf& b); -template<> EIGEN_STRONG_INLINE Packet2cf paddsub(const Packet2cf& a, const Packet2cf& b) -{ - const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x0,0x0)); - return Packet2cf(padd(a.v, pxor(mask, b.v))); -} template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { @@ -112,14 +106,9 @@ template<> EIGEN_STRONG_INLINE Packet2cf ploadu(const std::complex EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { - Packet2cf res; -#ifdef EIGEN_VECTORIZE_SSE3 - res.v = _mm_castpd_ps(_mm_loaddup_pd(reinterpret_cast(&from))); -#else - res.v = _mm_castpd_ps(_mm_load_sd(reinterpret_cast(&from))); - res.v = _mm_movelh_ps(res.v, res.v); -#endif - return res; + const float re = std::real(from); + const float im = std::imag(from); + return Packet2cf(_mm_set_ps(im, re, im, re)); } template<> EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* from) { return pset1(*from); } @@ -171,74 +160,21 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v)))); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - 
vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), - _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask))); - #endif - } -}; - -template<> struct conj_helper +EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) { - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000)); - return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask), - _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), - vec4f_swizzle1(b.v, 1, 0, 3, 2)))); - #endif - } -}; + return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); +} EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for SSE3 and 4 - Packet2cf res = conj_helper().pmul(a,b); + Packet2cf res = pmul(a, pconj(b)); __m128 s = _mm_mul_ps(b.v,b.v); - return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1))))); + return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,vec4f_swizzle1(s, 1, 0, 3, 2)))); } -EIGEN_STRONG_INLINE Packet2cf pcplxflip/* */(const Packet2cf& x) -{ - return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2)); -} //---------- double ---------- @@ -246,7 +182,7 @@ struct Packet1cd { EIGEN_STRONG_INLINE Packet1cd() {} EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {} - __m128d v; + Packet2d v; }; // Use the packet_traits defined in AVX/PacketMath.h instead if we're going @@ -354,66 +290,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(a, pconj(b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return internal::pmul(pconj(a), b); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), - 
_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)), mask))); - #endif - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - #ifdef EIGEN_VECTORIZE_SSE3 - return pconj(internal::pmul(a, b)); - #else - const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0)); - return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask), - _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1), - vec2d_swizzle1(b.v, 1, 0)))); - #endif - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for SSE3 and 4 - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); __m128d s = _mm_mul_pd(b.v,b.v); return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1)))); } diff --git a/Eigen/src/Core/arch/SSE/PacketMath.h b/Eigen/src/Core/arch/SSE/PacketMath.h index db102c73a2376887332d2b25f6894bfa86284653..b485e0df12cfa164b254bd9953702edf79915908 100755 --- a/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/Eigen/src/Core/arch/SSE/PacketMath.h @@ -444,7 +444,7 @@ template<> EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packe template<> EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) { return por(pcmp_lt(a,b), pcmp_eq(a,b)); } template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -463,7 +463,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmin(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmin(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_min_pd, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -494,7 +494,7 @@ template<> EIGEN_STRONG_INLINE Packet4i pmin(const Packet4i& a, const template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const Packet4f& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_ps, so we have to // resort to inline ASM here. This is supposed to be fixed in gcc6.3, @@ -513,7 +513,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pmax(const Packet4f& a, const #endif } template<> EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) { -#if EIGEN_COMP_GNUC && EIGEN_COMP_GNUC < 63 +#if EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 63 // There appears to be a bug in GCC, by which the optimizer may // flip the argument order in calls to _mm_max_pd, so we have to // resort to inline ASM here. 
This is supposed to be fixed in gcc6.3, diff --git a/Eigen/src/Core/arch/SVE/PacketMath.h b/Eigen/src/Core/arch/SVE/PacketMath.h index 4877b6d8090cdd0ae4ebf8260c3d50c1242aab70..9060b372ff57ec80bce35b9a763715a1ba79b60a 100644 --- a/Eigen/src/Core/arch/SVE/PacketMath.h +++ b/Eigen/src/Core/arch/SVE/PacketMath.h @@ -22,10 +22,6 @@ namespace internal #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 template diff --git a/Eigen/src/Core/arch/ZVector/Complex.h b/Eigen/src/Core/arch/ZVector/Complex.h index ddf5a97d86b938aff973c0e5db1a23d51c7c6206..6c67cfe058428be532b2ef2c34195235865f58eb 100644 --- a/Eigen/src/Core/arch/ZVector/Complex.h +++ b/Eigen/src/Core/arch/ZVector/Complex.h @@ -91,8 +91,18 @@ template<> struct packet_traits > : default_packet_traits }; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet2cf half; }; -template<> struct unpacket_traits { typedef std::complex type; enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet1cd half; }; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet2cf half; + typedef Packet4f as_real; +}; +template<> struct unpacket_traits { + typedef std::complex type; + enum {size=1, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; + typedef Packet1cd half; + typedef Packet2d as_real; +}; /* Forward declaration */ EIGEN_STRONG_INLINE void ptranspose(PacketBlock& kernel); @@ -150,7 +160,7 @@ template<> EIGEN_STRONG_INLINE void prefetch >(const std::c template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { - std::complex EIGEN_ALIGN16 res; + EIGEN_ALIGN16 std::complex res; pstore >(&res, a); return res; @@ -165,45 +175,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const { return pfirst(a); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec - Packet1cd res = conj_helper().pmul(a,b); + Packet1cd res = pmul(a,pconj(b)); Packet2d s = vec_madd(b.v, b.v, p2d_ZERO_); return Packet1cd(pdiv(res.v, s + vec_perm(s, s, p16uc_REVERSE64))); } @@ -228,7 +205,7 @@ 
template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { - std::complex EIGEN_ALIGN16 res[2]; + EIGEN_ALIGN16 std::complex res[2]; pstore >(res, a); return res[0]; @@ -258,14 +235,14 @@ template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex EIGEN_DEVICE_FUNC inline Packet2cf pgather, Packet2cf>(const std::complex* from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); } template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::complex* to, const Packet2cf& from, Index stride) { - std::complex EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 std::complex af[2]; pstore >((std::complex *) af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -337,39 +314,6 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return res; } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) @@ -456,45 +400,12 @@ template<> EIGEN_STRONG_INLINE std::complex predux_mul(const P return pfirst(prod); } -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(a, pconj(b)); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return internal::pmul(pconj(a), b); - } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(pmul(x,y),c); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const - { - return pconj(internal::pmul(a, b)); - } -}; - EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for AltiVec - Packet2cf res = conj_helper().pmul(a, b); + Packet2cf res = pmul(a, pconj(b)); Packet4f s = pmul(b.v, b.v); return Packet2cf(pdiv(res.v, padd(s, vec_perm(s, s, p16uc_COMPLEX32_REV)))); } diff --git a/Eigen/src/Core/arch/ZVector/PacketMath.h b/Eigen/src/Core/arch/ZVector/PacketMath.h index 
b10c1f6c7e1977c339bcd2cae960077623635c6e..a7b59c80edfc5f0b468877139af36055c974b85d 100755 --- a/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -22,10 +22,6 @@ namespace internal { #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif -#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD -#endif - #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif @@ -94,8 +90,9 @@ static _EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0); static _EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1); -static Packet2d p2d_ONE = { 1.0, 1.0 }; -static Packet2d p2d_ZERO_ = { -0.0, -0.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ZERO_ = { numext::bit_cast(0x8000000000000000ull), + numext::bit_cast(0x8000000000000000ull) }; #if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12) #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ @@ -361,7 +358,7 @@ pbroadcast4(const double *a, template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -371,7 +368,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* f template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const double* from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; af[0] = from[0*stride]; af[1] = from[1*stride]; return pload(af); @@ -379,7 +376,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet2d pgather(const dou template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) { - int EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 int ai[4]; pstore((int *)ai, from); to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -389,7 +386,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const template<> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2d& from, Index stride) { - double EIGEN_ALIGN16 af[2]; + EIGEN_ALIGN16 double af[2]; pstore(af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -463,8 +460,8 @@ template<> EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2d& template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { EIGEN_ZVECTOR_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { EIGEN_ALIGN16 int x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE double pfirst(const Packet2d& a) { EIGEN_ALIGN16 double x[2]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { @@ -642,7 +639,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; ai[0] = from[0*stride]; ai[1] = from[1*stride]; ai[2] = from[2*stride]; @@ -652,7 +649,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 ai[4]; + EIGEN_ALIGN16 float ai[4]; pstore((float *)ai, from); 
to[0*stride] = ai[0]; to[1*stride] = ai[1]; @@ -788,7 +785,7 @@ template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) return p; } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[2]; vec_st2f(a.v4f[0], &x[0]); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { @@ -946,7 +943,7 @@ pbroadcast4(const float *a, template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; af[0] = from[0*stride]; af[1] = from[1*stride]; af[2] = from[2*stride]; @@ -956,7 +953,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4f& from, Index stride) { - float EIGEN_ALIGN16 af[4]; + EIGEN_ALIGN16 float af[4]; pstore((float*)af, from); to[0*stride] = af[0]; to[1*stride] = af[1]; @@ -981,7 +978,7 @@ template<> EIGEN_STRONG_INLINE Packet4f pround (const Packet4f& a) { r template<> EIGEN_STRONG_INLINE Packet4f pceil (const Packet4f& a) { return vec_ceil(a); } template<> EIGEN_STRONG_INLINE Packet4f pfloor (const Packet4f& a) { return vec_floor(a); } template<> EIGEN_STRONG_INLINE Packet4f pabs (const Packet4f& a) { return vec_abs(a); } -template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; pstore(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { EIGEN_ALIGN16 float x[4]; pstore(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { diff --git a/Eigen/src/Core/functors/BinaryFunctors.h b/Eigen/src/Core/functors/BinaryFunctors.h index a182b4b748a815fb75ff71780ea4f66e5f80dd19..63f09ab9317e31d26525916d8ca0a031fa7c1247 100644 --- a/Eigen/src/Core/functors/BinaryFunctors.h +++ b/Eigen/src/Core/functors/BinaryFunctors.h @@ -50,7 +50,7 @@ struct scalar_sum_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough estimate! + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd // TODO vectorize mixed sum }; @@ -88,7 +88,7 @@ struct scalar_product_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! + Cost = (int(NumTraits::MulCost) + int(NumTraits::MulCost))/2, // rough estimate! PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul // TODO vectorize mixed product }; @@ -364,7 +364,7 @@ struct scalar_difference_op : binary_op_base template struct functor_traits > { enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, + Cost = (int(NumTraits::AddCost) + int(NumTraits::AddCost)) / 2, PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub }; }; diff --git a/Eigen/src/Core/functors/StlFunctors.h b/Eigen/src/Core/functors/StlFunctors.h index d2e7b5b032228c53fe6129c6f299e522f6440d33..4570c9b634bd920753aa4d9ab063c4ab49ab4ecd 100644 --- a/Eigen/src/Core/functors/StlFunctors.h +++ b/Eigen/src/Core/functors/StlFunctors.h @@ -12,6 +12,28 @@ namespace Eigen { +// Portable replacements for certain functors. 
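// (Editor's annotation: unlike std::equal_to / std::not_equal_to, the functors
// added below are EIGEN_DEVICE_FUNC-qualified, so they stay usable inside GPU
// kernels. A hypothetical use, assuming the usual Array API:
//   Eigen::ArrayXf a(3), b(3);
//   auto mask = a.binaryExpr(b, Eigen::numext::equal_to<float>());
// which yields a boolean-valued expression comparing a and b coefficient-wise.)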
+namespace numext { + +template +struct equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs == rhs; + } +}; + +template +struct not_equal_to { + typedef bool result_type; + EIGEN_DEVICE_FUNC bool operator()(const T& lhs, const T& rhs) const { + return lhs != rhs; + } +}; + +} + + namespace internal { // default functor traits for STL functors: @@ -68,10 +90,18 @@ template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + template struct functor_traits > { enum { Cost = 1, PacketAccess = false }; }; +template +struct functor_traits > + : functor_traits > {}; + #if (EIGEN_COMP_CXXVER < 11) // std::binder* are deprecated since c++11 and will be removed in c++17 template diff --git a/Eigen/src/Core/functors/UnaryFunctors.h b/Eigen/src/Core/functors/UnaryFunctors.h index c98fa573ceeb31e547edc5574eb3753fc40900a2..16136d185aa3c16cd7b7cff1fa97ca07c405ddf0 100644 --- a/Eigen/src/Core/functors/UnaryFunctors.h +++ b/Eigen/src/Core/functors/UnaryFunctors.h @@ -109,7 +109,7 @@ struct functor_traits > template struct scalar_conjugate_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op) EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); } + EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return numext::conj(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); } }; @@ -138,7 +138,7 @@ struct functor_traits > template struct scalar_arg_op { EIGEN_EMPTY_STRUCT_CTOR(scalar_arg_op) typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using numext::arg; return arg(a); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::arg(a); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::parg(a); } diff --git a/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 79367f19742a4bbb61168ea2c5598cac8e05bc2b..4c649a281df12aae096f1e20404fdda36b4ec87a 100644 --- a/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -44,7 +44,7 @@ inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff #endif // defined(EIGEN_DEFAULT_L2_CACHE_SIZE) #if defined(EIGEN_DEFAULT_L3_CACHE_SIZE) -#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_SET_DEFAULT_L3_CACHE_SIZE +#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE #else #define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val #endif // defined(EIGEN_DEFAULT_L3_CACHE_SIZE) @@ -349,36 +349,6 @@ inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_ computeProductBlockingSizes(k, m, n, num_threads); } -#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD - #define CJMADD(CJ,A,B,C,T) C = CJ.pmadd(A,B,C); -#else - - // FIXME (a bit overkill maybe ?) 
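// (Editor's annotation: the block deleted below appears to be a workaround for
// old compilers that generated poor code for c = cj.pmadd(a, b, c); the
// selector rerouted the all-scalar case through
//   t = b; t = cj.pmul(a, t); c = padd(c, t);
// Since the gebp kernels now call cj.pmadd(A, B, C) directly, the selector,
// the gebp_madd wrapper, and the CJMADD macro are all dead code.)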
- - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/) - { - c = cj.pmadd(a,b,c); - } - }; - - template struct gebp_madd_selector { - EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t) - { - t = b; t = cj.pmul(a,t); c = padd(c,t); - } - }; - - template - EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t) - { - gebp_madd_selector::run(cj,a,b,c,t); - } - - #define CJMADD(CJ,A,B,C,T) gebp_madd(CJ,A,B,C,T); -// #define CJMADD(CJ,A,B,C,T) T = B; T = CJ.pmul(A,T); C = padd(C,T); -#endif - template struct RhsPanelHelper { private: @@ -1673,8 +1643,8 @@ void gebp_kernel0) { Index remaining_rows = rows-i; @@ -2322,21 +2290,21 @@ EIGEN_DONT_INLINE void gemm_pack_lhs kernel; - for (int p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel.packet[p])); } else if (HasHalf && psize == HalfPacketSize) { gone_half = true; PacketBlock kernel_half; - for (int p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_half.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_half); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_half.packet[p])); } else if (HasQuarter && psize == QuarterPacketSize) { gone_quarter = true; PacketBlock kernel_quarter; - for (int p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); + for (Index p = 0; p < psize; ++p) kernel_quarter.packet[p] = lhs.template loadPacket(i+p+m, k); ptranspose(kernel_quarter); - for (int p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); + for (Index p = 0; p < psize; ++p) pstore(blockA+count+m+(pack)*p, cj.pconj(kernel_quarter.packet[p])); } } count += psize*pack; diff --git a/Eigen/src/Core/products/GeneralMatrixMatrix.h b/Eigen/src/Core/products/GeneralMatrixMatrix.h index caa65fcccf56cfbc0de60ae65c98e6975fbb89ea..73ddd260eb6f97ec570fec3cca3663e38d1ef886 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -59,9 +59,9 @@ typedef gebp_traits Traits; typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static void run(Index rows, Index cols, Index depth, - const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) @@ -69,9 +69,9 @@ static void run(Index rows, Index cols, Index depth, typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs, lhsStride); - RhsMapper rhs(_rhs, rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_, lhsStride); + RhsMapper rhs(rhs_, rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = 
(std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 6ba0d9bdb8d1752af714b5c7e326af6bb5e9a8d8..c0b5d8050ddbf288f2dcc9142eaa4717476de2a1 100644 --- a/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -60,9 +60,9 @@ template { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; - static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resIncr, Index resStride, + static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsStride, + ResScalar* res_, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; @@ -70,9 +70,9 @@ struct general_matrix_matrix_triangular_product LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -113,7 +113,7 @@ struct general_matrix_matrix_triangular_product::ret }; - void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { typedef blas_data_mapper ResMapper; typedef blas_data_mapper BufferMapper; - ResMapper res(_res, resStride, resIncr); + ResMapper res(res_, resStride, resIncr); gebp_kernel gebp_kernel1; gebp_kernel gebp_kernel2; diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index dfb6aebced46a246820c6f68efd15a400bfd30e2..974a047053f85cbc248ca0b012f20554e0a63bc5 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -359,6 +359,11 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product::type UnsignedIndex; + const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize); + const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf); + const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter); + Index i=0; for(; i(ResScalar(0)), c7 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -393,7 +397,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c3 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); @@ -436,7 +440,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)), c1 = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = 
rhs.template load(j,0); @@ -465,7 +469,8 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(ResScalar(0)); ResPacketHalf c0_h = pset1(ResScalar(0)); ResPacketQuarter c0_q = pset1(ResScalar(0)); - Index j=0; - for(; j+LhsPacketSize<=cols; j+=LhsPacketSize) + + for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) { RhsPacket b0 = rhs.template load(j,0); c0 = pcj.pmadd(lhs.template load(i,j),b0,c0); } ResScalar cc0 = predux(c0); if (HasHalf) { - for(; j+LhsPacketSizeHalf<=cols; j+=LhsPacketSizeHalf) + for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) { RhsPacketHalf b0 = rhs.template load(j,0); c0_h = pcj_half.pmadd(lhs.template load(i,j),b0,c0_h); @@ -496,14 +501,14 @@ EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void general_matrix_vector_product(j,0); c0_q = pcj_quarter.pmadd(lhs.template load(i,j),b0,c0_q); } cc0 += predux(c0_q); } - for(; j::type>::half HalfPacket; typedef typename unpacket_traits::type>::half>::half QuarterPacket; @@ -53,7 +53,7 @@ struct symm_pack_lhs HasHalf = (int)HalfPacketSize < (int)PacketSize, HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize}; - const_blas_data_mapper lhs(_lhs,lhsStride); + const_blas_data_mapper lhs(lhs_,lhsStride); Index count = 0; //Index peeled_mc3 = (rows/Pack1)*Pack1; @@ -101,11 +101,11 @@ template struct symm_pack_rhs { enum { PacketSize = packet_traits::size }; - void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2) + void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) { Index end_k = k2 + rows; Index count = 0; - const_blas_data_mapper rhs(_rhs,rhsStride); + const_blas_data_mapper rhs(rhs_,rhsStride); Index packet_cols8 = nr>=8 ? (cols/8) * 8 : 0; Index packet_cols4 = nr>=4 ? 
(cols/4) * 4 : 0; @@ -330,8 +330,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -342,9 +342,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -355,10 +355,10 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - LhsTransposeMapper lhs_transpose(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + LhsTransposeMapper lhs_transpose(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -425,8 +425,8 @@ struct product_selfadjoint_matrix& blocking); }; @@ -437,9 +437,9 @@ template EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -448,8 +448,8 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + ResMapper res(res_,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -466,7 +466,7 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix GEPP for(Index i2=0; i2& SelfAdjointView::type>::type UType; - typedef typename internal::remove_all::type>::type VType; + typedef typename internal::remove_all::type>::type UType; + typedef typename internal::remove_all::type>::type VType; internal::selfadjoint_rank2_update_selector ::run(_expression().const_cast_derived().data(),_expression().outerStride(),UType(actualU),VType(actualV),actualAlpha); diff --git a/Eigen/src/Core/products/TriangularMatrixMatrix.h b/Eigen/src/Core/products/TriangularMatrixMatrix.h index f0c60507ab10100e02933e253d25a03ff99256d7..ba605a1c2ac45751748f40b9804b709c19fba3cf 100644 --- a/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -18,10 +18,10 @@ namespace internal { // struct gemm_pack_lhs_triangular // { // Matrix::IsComplex && Conjugate> cj; -// const_blas_data_mapper lhs(_lhs,lhsStride); +// const_blas_data_mapper lhs(lhs_,lhsStride); // int count = 0; // const int peeled_mc = (rows/mr)*mr; // for(int i=0; i& blocking); }; @@ -110,9 +110,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, 
const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -124,9 +124,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -254,8 +254,8 @@ struct product_triangular_matrix_matrix& blocking); }; @@ -268,9 +268,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( Index _rows, Index _cols, Index _depth, - const Scalar* _lhs, Index lhsStride, - const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resIncr, Index resStride, + const Scalar* lhs_, Index lhsStride, + const Scalar* rhs_, Index rhsStride, + Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -283,9 +283,9 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride, resIncr); + LhsMapper lhs(lhs_,lhsStride); + RhsMapper rhs(rhs_,rhsStride); + ResMapper res(res_, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction diff --git a/Eigen/src/Core/products/TriangularMatrixVector.h b/Eigen/src/Core/products/TriangularMatrixVector.h index 76bfa159cedc5db03aafe8c798024a762b483c47..0a25748e1b13788fa9844c342716d6d886a21d1c 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector.h +++ b/Eigen/src/Core/products/TriangularMatrixVector.h @@ -26,30 +26,30 @@ struct triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const RhsScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const RhsScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index size = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : (std::min)(_rows,_cols); - Index cols = IsLower ? (std::min)(_rows,_cols) : _cols; + Index size = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : (std::min)(rows_,cols_); + Index cols = IsLower ? 
(std::min)(rows_,cols_) : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map, 0, InnerStride<> > RhsMap; - const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr)); + const RhsMap rhs(rhs_,cols,InnerStride<>(rhsIncr)); typename conj_expr_if::type cjRhs(rhs); typedef Map > ResMap; - ResMap res(_res,rows); + ResMap res(res_,rows); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; @@ -84,7 +84,7 @@ EIGEN_DONT_INLINE void triangular_matrix_vector_product EIGEN_DONT_INLINE void triangular_matrix_vector_product - ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha) + ::run(Index rows_, Index cols_, const LhsScalar* lhs_, Index lhsStride, + const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr, const ResScalar& alpha) { static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH; - Index diagSize = (std::min)(_rows,_cols); - Index rows = IsLower ? _rows : diagSize; - Index cols = IsLower ? diagSize : _cols; + Index diagSize = (std::min)(rows_,cols_); + Index rows = IsLower ? rows_ : diagSize; + Index cols = IsLower ? diagSize : cols_; typedef Map, 0, OuterStride<> > LhsMap; - const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride)); + const LhsMap lhs(lhs_,rows,cols,OuterStride<>(lhsStride)); typename conj_expr_if::type cjLhs(lhs); typedef Map > RhsMap; - const RhsMap rhs(_rhs,cols); + const RhsMap rhs(rhs_,cols); typename conj_expr_if::type cjRhs(rhs); typedef Map, 0, InnerStride<> > ResMap; - ResMap res(_res,rows,InnerStride<>(resIncr)); + ResMap res(res_,rows,InnerStride<>(resIncr)); typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; diff --git a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 3d47a2b94cbc22de5a5295c38e9ec5bdbf353930..0f8d3a1dac5a7fa406858151e759586d0224f0cb 100644 --- a/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -50,18 +50,18 @@ struct triangular_matrix_vector_product_trmv : #define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar) \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; \ template \ struct triangular_matrix_vector_product { \ - static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \ - const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \ + static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, \ + const Scalar* rhs_, Index rhsIncr, Scalar* res_, Index resIncr, Scalar alpha) { \ triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ } \ }; 
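A recurring, purely mechanical change in the hunks above and below: leading-underscore parameter names (`_rows`, `_cols`, `_lhs`, `_rhs`, `_res`) become trailing-underscore ones (`rows_`, `cols_`, `lhs_`, ...). The motivation is the C++ standard's identifier-reservation rules; as a general-language reminder (a sketch, not Eigen code):

// Reserved to the implementation in every scope:
//   identifiers containing a double underscore             (foo__bar)
//   identifiers starting with '_' plus an uppercase letter (_Rows)
// Reserved at global-namespace scope only:
//   identifiers starting with '_' plus a lowercase letter  (_rows)
// Never reserved, hence the rename target used here:
int rows_ = 0;  // a trailing underscore has no special meaning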
@@ -81,23 +81,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -121,24 +121,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size*lda; \ + y = res_; \ + a = lhs_ + size*lda; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ @@ -170,23 +170,23 @@ struct triangular_matrix_vector_product_trmv::run( \ - _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \ + rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha); \ return; \ }\ - Index size = (std::min)(_rows,_cols); \ - Index rows = IsLower ? _rows : size; \ - Index cols = IsLower ? size : _cols; \ + Index size = (std::min)(rows_,cols_); \ + Index rows = IsLower ? rows_ : size; \ + Index cols = IsLower ? size : cols_; \ \ typedef VectorX##EIGPREFIX VectorRhs; \ EIGTYPE *x, *y;\ \ /* Set x*/ \ - Map > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \ + Map > rhs(rhs_,cols,InnerStride<>(rhsIncr)); \ VectorRhs x_tmp; \ if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \ x = x_tmp.data(); \ @@ -210,24 +210,24 @@ struct triangular_matrix_vector_product_trmv(rows-size); \ n = convert_index(size); \ } \ else { \ x += size; \ - y = _res; \ - a = _lhs + size; \ + y = res_; \ + a = lhs_ + size; \ m = convert_index(size); \ n = convert_index(cols-size); \ } \ diff --git a/Eigen/src/Core/util/BlasUtil.h b/Eigen/src/Core/util/BlasUtil.h index c5161022cd29626ef2e828db3c195f4e3dc07e6c..e16a564980726ca200dcaf51e15adc72f89876d5 100755 --- a/Eigen/src/Core/util/BlasUtil.h +++ b/Eigen/src/Core/util/BlasUtil.h @@ -39,90 +39,6 @@ template struct general_matrix_vector_product; - -template struct conj_if; - -template<> struct conj_if { - template - inline T operator()(const T& x) const { return numext::conj(x); } - template - inline T pconj(const T& x) const { return internal::pconj(x); } -}; - -template<> struct conj_if { - template - inline const T& operator()(const T& x) const { return x; } - template - inline const T& pconj(const T& x) const { return x; } -}; - -// Generic implementation for custom complex types. 
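// (Editor's annotation: the three specializations removed below are the
// conjugating variants of the complex product; writing x = rx + i*ix and
// y = ry + i*iy, the scalar identities they implement are
//   x * conj(y)       = (rx*ry + ix*iy) + i*(ix*ry - rx*iy)
//   conj(x) * y       = (rx*ry + ix*iy) + i*(rx*iy - ix*ry)
//   conj(x) * conj(y) = (rx*ry - ix*iy) - i*(rx*iy + ix*ry)
// The first identity is also what the pdiv rewrites earlier in this patch
// rely on: a / b = (a * conj(b)) / (b * conj(b)) = (a * conj(b)) / |b|^2,
// with |b|^2 obtained by squaring b element-wise and adding the swapped
// real/imaginary pair.)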
-template -struct conj_helper -{ - typedef typename ScalarBinaryOpTraits::ReturnType Scalar; - - EIGEN_STRONG_INLINE Scalar pmadd(const LhsScalar& x, const RhsScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Scalar pmul(const LhsScalar& x, const RhsScalar& y) const - { return conj_if()(x) * conj_if()(y); } -}; - -template struct conj_helper -{ - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); } -}; - -template struct conj_helper, std::complex, false,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); } -}; - -template struct conj_helper, std::complex, true,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, std::complex, true,true> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const - { return c + pmul(x,y); } - - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const - { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); } -}; - -template struct conj_helper, RealScalar, Conj,false> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const - { return conj_if()(x)*y; } -}; - -template struct conj_helper, false,Conj> -{ - typedef std::complex Scalar; - EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const - { return x*conj_if()(y); } -}; - template struct get_factor { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); } }; @@ -602,7 +518,7 @@ struct blas_traits template::HasUsableDirectAccess> struct extract_data_selector { - static const typename T::Scalar* run(const T& m) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) { return blas_traits::extract(m).data(); } @@ -613,7 +529,8 @@ struct extract_data_selector { static typename T::Scalar* run(const T&) { return 0; } }; -template const typename T::Scalar* extract_data(const T& m) +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) { return extract_data_selector::run(m); } diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 
af4e69623830aece11ec266155ce0a878cdb204e..259ef0ca18ce702629474d6fc72ef11c77ffecb1 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -339,7 +339,7 @@ extern "C" { // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly. // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus: - #if EIGEN_COMP_ICC >= 1110 + #if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN #include #else #include @@ -438,13 +438,15 @@ #include #endif -#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!defined(EIGEN_COMP_CLANG) || EIGEN_COMP_CLANG>=380)) +#if defined(__F16C__) && (!defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG || EIGEN_COMP_CLANG>=380)) // We can use the optimized fp16 to float and float to fp16 conversion routines #define EIGEN_HAS_FP16_C - #if defined(EIGEN_COMP_CLANG) - // Workaround for clang: The FP16C intrinsics for clang are included by - // immintrin.h, as opposed to emmintrin.h as suggested by Intel: + #if EIGEN_COMP_GNUC + // Make sure immintrin.h is included, even if e.g. vectorization is + // explicitly disabled (see also issue #2395). + // Note that FP16C intrinsics for gcc and clang are included by immintrin.h, + // as opposed to emmintrin.h as suggested by Intel: // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711 #include #endif diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h index f7f907ab742766dcc8c6715fea4a23dbfe25081a..0667b1c44e1dc25aab8b9702dc5a04dc954be87c 100644 --- a/Eigen/src/Core/util/Constants.h +++ b/Eigen/src/Core/util/Constants.h @@ -134,7 +134,7 @@ const unsigned int LinearAccessBit = 0x10; * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable. * This rules out read-only expressions. * - * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note + * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but not * the other: * \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit * \li Map-to-const expressions, for example Map, have DirectAccessBit but not LvalueBit @@ -157,7 +157,7 @@ const unsigned int DirectAccessBit = 0x40; /** \deprecated \ingroup flags * * means the first coefficient packet is guaranteed to be aligned. - * An expression cannot has the AlignedBit without the PacketAccessBit flag. + * An expression cannot have the AlignedBit without the PacketAccessBit flag. * In other words, this means we are allow to perform an aligned packet access to the first element regardless * of the expression kind: * \code diff --git a/Eigen/src/Core/util/DisableStupidWarnings.h b/Eigen/src/Core/util/DisableStupidWarnings.h index fe0cfec0bc2461ac44abca8f3d05b468d3c60fd9..0865fb698d593b57ebfaa6aa838beccca5e4007a 100755 --- a/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/Eigen/src/Core/util/DisableStupidWarnings.h @@ -1,9 +1,10 @@ #ifndef EIGEN_WARNINGS_DISABLED #define EIGEN_WARNINGS_DISABLED -#ifdef _MSC_VER +#if defined(_MSC_VER) // 4100 - unreferenced formal parameter (occurred e.g. 
in aligned_allocator::destroy(pointer p)) // 4101 - unreferenced local variable + // 4127 - conditional expression is constant // 4181 - qualifier applied to reference type ignored // 4211 - nonstandard extension used : redefined extern to static // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data @@ -19,7 +20,7 @@ #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma warning( push ) #endif - #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) + #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800) #elif defined __INTEL_COMPILER // 2196 - routine is both "inline" and "noinline" ("noinline" assumed) @@ -35,25 +36,28 @@ #pragma warning disable 2196 279 1684 2259 #elif defined __clang__ - // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant - // this is really a stupid warning as it warns on compile-time expressions involving enums #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS #pragma clang diagnostic push #endif - #pragma clang diagnostic ignored "-Wconstant-logical-operand" - #if __clang_major__ >= 3 && __clang_minor__ >= 5 - #pragma clang diagnostic ignored "-Wabsolute-value" - #endif - #if __clang_major__ >= 10 - #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" - #endif - #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L - // warning: generic selections are a C11-specific feature - // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h - #pragma clang diagnostic ignored "-Wc11-extensions" + #if defined(__has_warning) + // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant + // this is really a stupid warning as it warns on compile-time expressions involving enums + #if __has_warning("-Wconstant-logical-operand") + #pragma clang diagnostic ignored "-Wconstant-logical-operand" + #endif + #if __has_warning("-Wimplicit-int-float-conversion") + #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion" + #endif + #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L + // warning: generic selections are a C11-specific feature + // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h + #if __has_warning("-Wc11-extensions") + #pragma clang diagnostic ignored "-Wc11-extensions" + #endif + #endif #endif -#elif defined __GNUC__ +#elif defined __GNUC__ && !defined(__FUJITSU) #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push @@ -74,25 +78,53 @@ #endif #if defined __NVCC__ - #pragma diag_suppress boolean_controlling_expr_is_constant + // MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so + // we instead use Microsoft's __pragma extension. 
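// (Editor's annotation: the macros defined just below stringize their argument
// because the C++11 _Pragma operator requires a string literal. Expansion
// sketch, assuming a recent nvcc:
//   EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
//     -> EIGEN_MAKE_PRAGMA(nv_diag_suppress code_is_unreachable)
//     -> _Pragma("nv_diag_suppress code_is_unreachable")
// Older nvcc front ends get diag_suppress instead, and MSVC hosts use the
// __pragma extension because the compilers required by CUDA 9.x lack _Pragma.)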
+ #if defined _MSC_VER + #define EIGEN_MAKE_PRAGMA(X) __pragma(#X) + #else + #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X) + #endif + #if defined __NVCC_DIAG_PRAGMA_SUPPORT__ + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X) + #else + #define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X) + #endif + + EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant) // Disable the "statement is unreachable" message - #pragma diag_suppress code_is_unreachable + EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable) // Disable the "dynamic initialization in unreachable code" message - #pragma diag_suppress initialization_not_reachable + EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable) // Disable the "invalid error number" message that we get with older versions of nvcc - #pragma diag_suppress 1222 + EIGEN_NV_DIAG_SUPPRESS(1222) // Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are many of them and they seem to change with every version of the compiler) - #pragma diag_suppress 2527 - #pragma diag_suppress 2529 - #pragma diag_suppress 2651 - #pragma diag_suppress 2653 - #pragma diag_suppress 2668 - #pragma diag_suppress 2669 - #pragma diag_suppress 2670 - #pragma diag_suppress 2671 - #pragma diag_suppress 2735 - #pragma diag_suppress 2737 - #pragma diag_suppress 2739 + EIGEN_NV_DIAG_SUPPRESS(2527) + EIGEN_NV_DIAG_SUPPRESS(2529) + EIGEN_NV_DIAG_SUPPRESS(2651) + EIGEN_NV_DIAG_SUPPRESS(2653) + EIGEN_NV_DIAG_SUPPRESS(2668) + EIGEN_NV_DIAG_SUPPRESS(2669) + EIGEN_NV_DIAG_SUPPRESS(2670) + EIGEN_NV_DIAG_SUPPRESS(2671) + EIGEN_NV_DIAG_SUPPRESS(2735) + EIGEN_NV_DIAG_SUPPRESS(2737) + EIGEN_NV_DIAG_SUPPRESS(2739) + EIGEN_NV_DIAG_SUPPRESS(2885) + EIGEN_NV_DIAG_SUPPRESS(2888) + EIGEN_NV_DIAG_SUPPRESS(2976) + EIGEN_NV_DIAG_SUPPRESS(2979) + EIGEN_NV_DIAG_SUPPRESS(20011) + EIGEN_NV_DIAG_SUPPRESS(20014) + // Disable the "// __device__ annotation is ignored on a function(...) that is + // explicitly defaulted on its first declaration" message. + // The __device__ annotation seems to actually be needed in some cases, + // otherwise resulting in kernel runtime errors. + EIGEN_NV_DIAG_SUPPRESS(2886) + EIGEN_NV_DIAG_SUPPRESS(2977) + EIGEN_NV_DIAG_SUPPRESS(20012) + #undef EIGEN_NV_DIAG_SUPPRESS + #undef EIGEN_MAKE_PRAGMA #endif #else diff --git a/Eigen/src/Core/util/IntegralConstant.h b/Eigen/src/Core/util/IntegralConstant.h index ef3fdfb9489f7ab2a6982c50c567429a8bcb4d26..e0092f654d9ca5138d475fc8f3a2a51c23e677b7 100644 --- a/Eigen/src/Core/util/IntegralConstant.h +++ b/Eigen/src/Core/util/IntegralConstant.h @@ -77,7 +77,7 @@ public: template FixedInt operator&( FixedInt) const { return FixedInt(); } -#if EIGEN_HAS_CXX14 +#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // Needed in C++14 to allow fix(): FixedInt operator() () const { return *this; } @@ -138,7 +138,7 @@ template struct get_fixed_value,Default> { static const int value = N; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template struct get_fixed_value (*)(),Default> { static const int value = N; }; @@ -154,7 +154,7 @@ struct get_fixed_value,Default> { }; template EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) { return x; } -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES template EIGEN_DEVICE_FUNC Index get_runtime_value(FixedInt (*)()) { return N; } #endif @@ -166,7 +166,7 @@ template struct clea // Convert any integral type (e.g., short, int, unsigned int, etc.) 
to Eigen::Index template struct cleanup_index_type::value>::type> { typedef Index type; }; -#if !EIGEN_HAS_CXX14 +#if !EIGEN_HAS_CXX14_VARIABLE_TEMPLATES // In c++98/c++11, fix is a pointer to function that we better cleanup to a true FixedInt: template struct cleanup_index_type (*)(), DynamicKey> { typedef FixedInt type; }; #endif @@ -184,7 +184,7 @@ template struct cleanup_index_type static const internal::FixedInt fix{}; #else diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index d2e84492834178ff4f71d49199700cc2069c9955..cd2dbe77a9f62f704dfa1513ee7391597d12dcca 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -16,8 +16,8 @@ //------------------------------------------------------------------------------------------ #define EIGEN_WORLD_VERSION 3 -#define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 90 +#define EIGEN_MAJOR_VERSION 4 +#define EIGEN_MINOR_VERSION 1 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -162,8 +162,8 @@ /// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++ // XLC version -// 3.1 0x0301 -// 4.5 0x0405 +// 3.1 0x0301 +// 4.5 0x0405 // 5.0 0x0500 // 12.1 0x0C01 #if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__) @@ -275,7 +275,7 @@ /// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE /// compliant Arm fp16 type -#if EIGEN_ARCH_ARM64 +#if EIGEN_ARCH_ARM_OR_ARM64 #ifndef EIGEN_HAS_ARM64_FP16 #if defined(__ARM_FP16_FORMAT_IEEE) #define EIGEN_HAS_ARM64_FP16 1 @@ -285,28 +285,6 @@ #endif #endif -/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture -/// supports Neon vector intrinsics for fp16. -#if EIGEN_ARCH_ARM64 - #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 - #else - #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 - #endif - #endif -#endif - -/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture -/// supports Neon scalar intrinsics for fp16. -#if EIGEN_ARCH_ARM64 - #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC - #if defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) - #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 - #endif - #endif -#endif - /// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS #if defined(__mips__) || defined(__mips) #define EIGEN_ARCH_MIPS 1 @@ -565,6 +543,32 @@ // #endif +/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture +/// supports Neon vector intrinsics for fp16. +#if EIGEN_ARCH_ARM_OR_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways even in GCC (e.g. vdiv_f16, vsqrt_f16). + #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1 + #else + #define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0 + #endif + #endif +#endif + +/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture +/// supports Neon scalar intrinsics for fp16. +#if EIGEN_ARCH_ARM_OR_ARM64 + #ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC + // Clang only supports FP16 on aarch64, and not all intrinsics are available + // on A32 anyways, even in GCC (e.g. vceqh_f16). 
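// As a sketch of how these gates are consumed downstream (hypothetical user
// code, assuming the standard <arm_fp16.h> intrinsics):
//   #if EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
//     float16_t r = vaddh_f16(a, b);  // scalar fp16 add, aarch64 only
//   #endif
// A32 targets and GPU device compilation passes keep the generic scalar path.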
+ #if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE) + #define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1 + #endif + #endif +#endif + #if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__) // EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro. // In most cases we want to check if both macros are defined which can be done using the define below. @@ -637,6 +641,14 @@ #define EIGEN_COMP_CXXVER 03 #endif +#ifndef EIGEN_HAS_CXX14_VARIABLE_TEMPLATES + #if defined(__cpp_variable_templates) && __cpp_variable_templates >= 201304 && EIGEN_MAX_CPP_VER>=14 + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 1 + #else + #define EIGEN_HAS_CXX14_VARIABLE_TEMPLATES 0 + #endif +#endif + // The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features // but in practice we should not rely on them but rather on the availabilty of @@ -833,7 +845,7 @@ #endif #endif -// NOTE: the required Apple's clang version is very conservative +// NOTE: the required Apple's clang version is very conservative // and it could be that XCode 9 works just fine. // NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support // and not tested. @@ -962,7 +974,7 @@ #endif #define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline)) // All functions callable from CUDA/HIP code must be qualified with __device__ -#elif defined(EIGEN_GPUCC) +#elif defined(EIGEN_GPUCC) #define EIGEN_DEVICE_FUNC __host__ __device__ #else #define EIGEN_DEVICE_FUNC @@ -989,7 +1001,7 @@ #else #define eigen_plain_assert(x) #endif -#else +#else #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO namespace Eigen { namespace internal { @@ -1123,7 +1135,16 @@ namespace Eigen { #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,v,wa" (X)); #elif EIGEN_ARCH_ARM_OR_ARM64 // General, NEON. - #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + // Clang doesn't like "r", + // error: non-trivial scalar-to-vector conversion, possible invalid + // constraint for vector type + // GCC < 5 doesn't like "g", + // error: 'asm' operand requires impossible reload + #if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_MOST(5, 0) + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+r,w" (X)); + #else + #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,w" (X)); + #endif #elif EIGEN_ARCH_i386_OR_x86_64 // General, SSE. #define EIGEN_OPTIMIZATION_BARRIER(X) __asm__ ("" : "+g,x" (X)); @@ -1177,8 +1198,12 @@ namespace Eigen { #define EIGEN_USING_STD(FUNC) using std::FUNC; #endif -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_COMP_NVCC) - // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1916 || (EIGEN_COMP_MSVC == 1916 && EIGEN_COMP_NVCC)) + // For older MSVC versions, as well as 1900 && CUDA 8, using the base operator is necessary, + // otherwise we get duplicate definition errors + // For later MSVC versions, we require explicit operator= definition, otherwise we get + // use of implicitly deleted operator errors. + // (cf Bugs 920, 1000, 1324, 2291) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -1204,7 +1229,7 @@ namespace Eigen { * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. 
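 * A minimal sketch of the case this macro exists for (hypothetical class,
 * for illustration only):
 * \code
 * struct S {
 *   S& operator=(const S&) { return *this; } // user-provided copy-assignment
 *   EIGEN_DEFAULT_COPY_CONSTRUCTOR(S)        // re-default the copy constructor
 * };
 * \endcode
 * Without the macro, S's implicitly generated copy constructor is deprecated,
 * and compilers that implement -Wdeprecated-copy warn on every copy of S.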
*/ #if EIGEN_HAS_CXX11 -#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) CLASS(const CLASS&) = default; +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; #else #define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) #endif @@ -1229,12 +1254,12 @@ namespace Eigen { */ #if EIGEN_HAS_CXX11 #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() = default; \ - ~Derived() = default; + EIGEN_DEVICE_FUNC Derived() = default; \ + EIGEN_DEVICE_FUNC ~Derived() = default; #else #define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ - Derived() {}; \ - /* ~Derived() {}; */ + EIGEN_DEVICE_FUNC Derived() {}; \ + /* EIGEN_DEVICE_FUNC ~Derived() {}; */ #endif diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 7cbe8a672ab310e2a996d5cb1f19f820f329889d..3aea7df51ac79952f6319a5931e8ac534b632307 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -292,20 +292,59 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) +template EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T *ptr, std::size_t size) { - std::size_t i; + std::size_t i=0; EIGEN_TRY { - for (i = 0; i < size; ++i) ::new (ptr + i) T; - return ptr; + for (i = 0; i < size; ++i) ::new (ptr + i) T; } EIGEN_CATCH(...) { destruct_elements_of_array(ptr, i); EIGEN_THROW; } - return NULL; + return ptr; +} + +/** \internal Copy-constructs the elements of an array. + * The \a size parameter tells on how many objects to copy. + */ +template EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T *ptr, const T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; +} + +/** \internal Move-constructs the elements of an array. + * The \a size parameter tells on how many objects to move. + */ +template EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T *ptr, T* src, std::size_t size) +{ + std::size_t i=0; + EIGEN_TRY + { +#if EIGEN_HAS_RVALUE_REFERENCES + for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i))); +#else + for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i)); +#endif + } + EIGEN_CATCH(...) + { + destruct_elements_of_array(ptr, i); + EIGEN_THROW; + } + return ptr; } /***************************************************************************** @@ -326,10 +365,10 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t s template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); + T *result = static_cast(aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) 
{ @@ -342,10 +381,10 @@ template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) { check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); EIGEN_TRY { - return construct_elements_of_array(result, size); + return default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -377,21 +416,32 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned { check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(new_size < old_size) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(new_size > old_size) + + // If elements need to be explicitly initialized, we cannot simply realloc + // (or memcpy) the memory block - each element needs to be reconstructed. + // Otherwise, objects that contain internal pointers like mpfr or + // AnnoyingScalar can be pointing to the wrong thing. + T* result = static_cast(conditional_aligned_malloc(sizeof(T)*new_size)); + EIGEN_TRY { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) - { - conditional_aligned_free(result); - EIGEN_THROW; + // Move-construct initial elements. + std::size_t copy_size = (std::min)(old_size, new_size); + move_construct_elements_of_array(result, pts, copy_size); + + // Default-construct remaining elements. + if (new_size > old_size) { + default_construct_elements_of_array(result + copy_size, new_size - old_size); } + + // Delete old elements. + conditional_aligned_delete(pts, old_size); } + EIGEN_CATCH(...) + { + conditional_aligned_free(result); + EIGEN_THROW; + } + return result; } @@ -401,12 +451,12 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned if(size==0) return 0; // short-cut. Also fixes Bug 884 check_size_for_overflow(size); - T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); + T *result = static_cast(conditional_aligned_malloc(sizeof(T)*size)); if(NumTraits::RequireInitialization) { EIGEN_TRY { - construct_elements_of_array(result, size); + default_construct_elements_of_array(result, size); } EIGEN_CATCH(...) { @@ -419,24 +469,13 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) { + if (NumTraits::RequireInitialization) { + return conditional_aligned_realloc_new(pts, new_size, old_size); + } + check_size_for_overflow(new_size); check_size_for_overflow(old_size); - if(NumTraits::RequireInitialization && (new_size < old_size)) - destruct_elements_of_array(pts+new_size, old_size-new_size); - T *result = reinterpret_cast(conditional_aligned_realloc(reinterpret_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); - if(NumTraits::RequireInitialization && (new_size > old_size)) - { - EIGEN_TRY - { - construct_elements_of_array(result+old_size, new_size-old_size); - } - EIGEN_CATCH(...) 
- { - conditional_aligned_free(result); - EIGEN_THROW; - } - } - return result; + return static_cast(conditional_aligned_realloc(static_cast(pts), sizeof(T)*new_size, sizeof(T)*old_size)); } template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) @@ -566,6 +605,17 @@ template struct smart_memmove_helper { } }; +#if EIGEN_HAS_RVALUE_REFERENCES +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::move(start, end, target); +} +#else +template EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) +{ + return std::copy(start, end, target); +} +#endif /***************************************************************************** *** Implementation of runtime stack allocation (falling back to malloc) *** @@ -606,7 +656,7 @@ template class aligned_stack_memory_handler : noncopyable : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) - Eigen::internal::construct_elements_of_array(m_ptr, size); + Eigen::internal::default_construct_elements_of_array(m_ptr, size); } EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() @@ -657,7 +707,7 @@ struct local_nested_eval_wrapper m_deallocate(ptr==0) { if(NumTraits::RequireInitialization && object.data()) - Eigen::internal::construct_elements_of_array(object.data(), object.size()); + Eigen::internal::default_construct_elements_of_array(object.data(), object.size()); object = xpr; } diff --git a/Eigen/src/Core/util/Meta.h b/Eigen/src/Core/util/Meta.h index f66325f898d2f8d354a07bc3ee797929aaa83b6b..b7635f985291b20c0f9df8f5545d48056c0d3caf 100755 --- a/Eigen/src/Core/util/Meta.h +++ b/Eigen/src/Core/util/Meta.h @@ -133,7 +133,10 @@ template struct remove_all { typedef typename remove_all< template struct is_arithmetic { enum { value = false }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +// GPU devices treat `long double` as `double`. +#ifndef EIGEN_GPU_COMPILE_PHASE template<> struct is_arithmetic { enum { value = true }; }; +#endif template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; @@ -189,21 +192,9 @@ template<> struct make_unsigned { typedef unsigned int type; } template<> struct make_unsigned { typedef unsigned int type; }; template<> struct make_unsigned { typedef unsigned long type; }; template<> struct make_unsigned { typedef unsigned long type; }; -#if EIGEN_COMP_MSVC -template<> struct make_unsigned { typedef unsigned __int64 type; }; -template<> struct make_unsigned { typedef unsigned __int64 type; }; -#endif - -// Some platforms define int64_t as long long even for C++03. In this case we -// are missing the definition for make_unsigned. If we just define it, we get -// duplicated definitions for platforms defining int64_t as signed long for -// C++03. We therefore add the specialization for C++03 long long for these -// platforms only. -#if EIGEN_OS_MAC template<> struct make_unsigned { typedef unsigned long long type; }; template<> struct make_unsigned { typedef unsigned long long type; }; #endif -#endif template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -466,20 +457,32 @@ template struct array_size > { }; #endif + /** \internal - * Analogue of the std::size free function. 
- * It returns the size of the container or view \a x of type \c T + * Analogue of the std::ssize free function. + * It returns the signed size of the container or view \a x of type \c T * * It currently supports: * - any types T defining a member T::size() const * - plain C arrays as T[N] * + * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function. */ -template -EIGEN_CONSTEXPR Index size(const T& x) { return x.size(); } +#if EIGEN_COMP_CXXVER < 20 +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T& x) { + return static_cast(x.size()); +} -template -EIGEN_CONSTEXPR Index size(const T (&) [N]) { return N; } +template +EIGEN_CONSTEXPR std::ptrdiff_t index_list_size(const T (&)[N]) { return N; } +#else +template +EIGEN_CONSTEXPR auto index_list_size(T&& x) { + using std::ssize; + return ssize(std::forward(x)); +} +#endif // EIGEN_COMP_CXXVER /** \internal * Convenient struct to get the result type of a nullary, unary, binary, or @@ -696,8 +699,7 @@ struct has_binary_operator template Y))) > - // use ?: instead of || just to shut up a stupid gcc 4.3 warning + bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))> class meta_sqrt { enum { @@ -715,20 +717,25 @@ class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? /** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. + * at compile-time. */ -template +template=B)> struct meta_least_common_multiple { enum { ret = meta_least_common_multiple::ret }; }; +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; template -struct meta_least_common_multiple +struct meta_least_common_multiple { enum { ret = A*K }; }; + /** \internal determines whether the product of two numeric types is allowed and what the return type is */ template struct scalar_product_traits { diff --git a/Eigen/src/Core/util/XprHelper.h b/Eigen/src/Core/util/XprHelper.h index 2c63a95243c4ae1ab3044f5146172ee3e6ca52e4..71c32b8a116ba86bf04a00e8c260ef912154a01a 100644 --- a/Eigen/src/Core/util/XprHelper.h +++ b/Eigen/src/Core/util/XprHelper.h @@ -184,19 +184,7 @@ template struct functor_traits template struct packet_traits; -template struct unpacket_traits -{ - typedef T type; - typedef T half; - enum - { - size = 1, - alignment = 1, - vectorizable = false, - masked_load_available=false, - masked_store_available=false - }; -}; +template struct unpacket_traits; template::size)==0 || is_same::half>::value> @@ -611,9 +599,9 @@ template MatrixRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType; typedef Array ArrayRowType; + int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType; typedef typename conditional< is_same< typename traits::XprKind, MatrixXpr >::value, diff --git a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 87d789b3f4a815c113c1cbcdaeef1ac7b8c74f29..26324cee9d91576b6ee0e9a8de5ee9fda8907cd2 100644 --- a/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -119,8 +119,8 @@ template class GeneralizedEigenSolver : m_eivec(), m_alphas(), m_betas(), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ() {} @@ 
-134,8 +134,8 @@ template class GeneralizedEigenSolver : m_eivec(size, size), m_alphas(size), m_betas(size), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(size), m_tmp(size) {} @@ -156,8 +156,8 @@ template class GeneralizedEigenSolver : m_eivec(A.rows(), A.cols()), m_alphas(A.cols()), m_betas(A.cols()), - m_valuesOkay(false), - m_vectorsOkay(false), + m_computeEigenvectors(false), + m_isInitialized(false), m_realQZ(A.cols()), m_tmp(A.cols()) { @@ -177,7 +177,8 @@ template class GeneralizedEigenSolver * \sa eigenvalues() */ EigenvectorsType eigenvectors() const { - eigen_assert(m_vectorsOkay && "Eigenvectors for GeneralizedEigenSolver were not calculated."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors"); + eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated"); return m_eivec; } @@ -201,7 +202,7 @@ template class GeneralizedEigenSolver */ EigenvalueType eigenvalues() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues."); return EigenvalueType(m_alphas,m_betas); } @@ -212,7 +213,7 @@ template class GeneralizedEigenSolver * \sa betas(), eigenvalues() */ ComplexVectorType alphas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas."); return m_alphas; } @@ -223,7 +224,7 @@ template class GeneralizedEigenSolver * \sa alphas(), eigenvalues() */ VectorType betas() const { - eigen_assert(m_valuesOkay && "GeneralizedEigenSolver is not initialized."); + eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute betas."); return m_betas; } @@ -254,7 +255,7 @@ template class GeneralizedEigenSolver ComputationInfo info() const { - eigen_assert(m_valuesOkay && "EigenSolver is not initialized."); + eigen_assert(m_isInitialized && "EigenSolver is not initialized."); return m_realQZ.info(); } @@ -277,7 +278,8 @@ template class GeneralizedEigenSolver EigenvectorsType m_eivec; ComplexVectorType m_alphas; VectorType m_betas; - bool m_valuesOkay, m_vectorsOkay; + bool m_computeEigenvectors; + bool m_isInitialized; RealQZ m_realQZ; ComplexVectorType m_tmp; }; @@ -292,8 +294,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp using std::abs; eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows()); Index size = A.cols(); - m_valuesOkay = false; - m_vectorsOkay = false; // Reduce to generalized real Schur form: // A = Q S Z and B = Q T Z m_realQZ.compute(A, B, computeEigenvectors); @@ -406,10 +406,9 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp i += 2; } } - - m_valuesOkay = true; - m_vectorsOkay = computeEigenvectors; } + m_computeEigenvectors = computeEigenvectors; + m_isInitialized = true; return *this; } diff --git a/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/Eigen/src/Eigenvalues/HessenbergDecomposition.h index d947dac4ebdb80a8d8184f5a30be9372736c61ae..1f21139346e7394b6c522dba3a4368125232fa99 100644 --- a/Eigen/src/Eigenvalues/HessenbergDecomposition.h +++ b/Eigen/src/Eigenvalues/HessenbergDecomposition.h @@ -267,7 +267,7 @@ template class HessenbergDecomposition private: - typedef Matrix VectorType; + typedef Matrix VectorType; typedef typename NumTraits::Real RealScalar; static void _compute(MatrixType& matA, 
CoeffVectorType& hCoeffs, VectorType& temp); diff --git a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index 59e59644eb370813b0e996ca6a6ca949897d16e5..14692365ffbf3a53eb67c0d64a961e098d14578b 100644 --- a/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -125,6 +125,7 @@ template class SelfAdjointEigenSolver : m_eivec(), m_eivalues(), m_subdiag(), + m_hcoeffs(), m_info(InvalidInput), m_isInitialized(false), m_eigenvectorsOk(false) @@ -147,6 +148,7 @@ template class SelfAdjointEigenSolver : m_eivec(size, size), m_eivalues(size), m_subdiag(size > 1 ? size - 1 : 1), + m_hcoeffs(size > 1 ? size - 1 : 1), m_isInitialized(false), m_eigenvectorsOk(false) {} @@ -172,6 +174,7 @@ template class SelfAdjointEigenSolver : m_eivec(matrix.rows(), matrix.cols()), m_eivalues(matrix.cols()), m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1), + m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1), m_isInitialized(false), m_eigenvectorsOk(false) { @@ -378,6 +381,7 @@ template class SelfAdjointEigenSolver EigenvectorsType m_eivec; RealVectorType m_eivalues; typename TridiagonalizationType::SubDiagonalType m_subdiag; + typename TridiagonalizationType::CoeffVectorType m_hcoeffs; ComputationInfo m_info; bool m_isInitialized; bool m_eigenvectorsOk; @@ -450,7 +454,8 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(scale==RealScalar(0)) scale = RealScalar(1); mat.template triangularView() /= scale; m_subdiag.resize(n-1); - internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors); + m_hcoeffs.resize(n-1); + internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, computeEigenvectors); m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec); diff --git a/Eigen/src/Eigenvalues/Tridiagonalization.h b/Eigen/src/Eigenvalues/Tridiagonalization.h index 6c8084f76a60283c4576165a901785ed58a911e9..eda82794ab683fdc46c164971ede750cf469fdc0 100644 --- a/Eigen/src/Eigenvalues/Tridiagonalization.h +++ b/Eigen/src/Eigenvalues/Tridiagonalization.h @@ -425,12 +425,13 @@ struct tridiagonalization_inplace_selector; * * \sa class Tridiagonalization */ -template +template EIGEN_DEVICE_FUNC -void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) +void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, + CoeffVectorType& hcoeffs, bool extractQ) { eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1); - tridiagonalization_inplace_selector::run(mat, diag, subdiag, extractQ); + tridiagonalization_inplace_selector::run(mat, diag, subdiag, hcoeffs, extractQ); } /** \internal @@ -439,14 +440,12 @@ void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonal template struct tridiagonalization_inplace_selector { - typedef typename Tridiagonalization::CoeffVectorType CoeffVectorType; typedef typename Tridiagonalization::HouseholderSequenceType HouseholderSequenceType; - template + template static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType& hCoeffs, bool extractQ) { - CoeffVectorType hCoeffs(mat.cols()-1); - tridiagonalization_inplace(mat,hCoeffs); + tridiagonalization_inplace(mat, hCoeffs); diag = mat.diagonal().real(); subdiag = mat.template 
diagonal<-1>().real(); if(extractQ) @@ -466,8 +465,8 @@ struct tridiagonalization_inplace_selector typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; - template - static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ) + template + static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&, bool extractQ) { using std::sqrt; const RealScalar tol = (std::numeric_limits::min)(); @@ -511,9 +510,9 @@ struct tridiagonalization_inplace_selector { typedef typename MatrixType::Scalar Scalar; - template + template static EIGEN_DEVICE_FUNC - void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ) + void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&, bool extractQ) { diag(0,0) = numext::real(mat(0,0)); if(extractQ) diff --git a/Eigen/src/Geometry/Umeyama.h b/Eigen/src/Geometry/Umeyama.h index 6b755008fdc5a4e68eec9abfd5752e28b8b56282..2a5c395b25d448dd7f6cdd3591e197f1eecde405 100644 --- a/Eigen/src/Geometry/Umeyama.h +++ b/Eigen/src/Geometry/Umeyama.h @@ -136,8 +136,10 @@ umeyama(const MatrixBase& src, const MatrixBase& dst, boo // Eq. (39) VectorType S = VectorType::Ones(m); - if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) - S(m-1) = -1; + if ( svd.matrixU().determinant() * svd.matrixV().determinant() < 0 ) { + Index tmp = m - 1; + S(tmp) = -1; + } // Eq. (40) and (43) Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose(); diff --git a/Eigen/src/Geometry/arch/Geometry_SIMD.h b/Eigen/src/Geometry/arch/Geometry_SIMD.h index 9c15bfb98b4ba732893e75022f7aa5b2f0c7526a..9af6a9af720d8177759139b33f8f7db80e39f6fa 100644 --- a/Eigen/src/Geometry/arch/Geometry_SIMD.h +++ b/Eigen/src/Geometry/arch/Geometry_SIMD.h @@ -28,8 +28,9 @@ struct quat_product evaluator ae(_a.coeffs()); evaluator be(_b.coeffs()); Quaternion res; - float arr[4] = {0.f, 0.f, 0.f, -0.f}; - const Packet4f mask = pset(arr); + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {0.f, 0.f, 0.f, neg_zero}; + const Packet4f mask = ploadu(arr); Packet4f a = ae.template packet(0); Packet4f b = be.template packet(0); Packet4f s1 = pmul(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); @@ -55,8 +56,9 @@ struct quat_conj { evaluator qe(q.coeffs()); Quaternion res; - float arr[4] = {-0.f,-0.f,-0.f,0.f}; - const Packet4f mask = pset(arr); + const float neg_zero = numext::bit_cast(0x80000000u); + const float arr[4] = {neg_zero, neg_zero, neg_zero,0.f}; + const Packet4f mask = ploadu(arr); pstoret(&res.x(), pxor(mask, qe.template packet::Alignment,Packet4f>(0))); return res; } @@ -146,10 +148,11 @@ struct quat_conj { evaluator qe(q.coeffs()); Quaternion res; - double arr1[2] = {-0.0, -0.0}; - double arr2[2] = {-0.0, 0.0}; - const Packet2d mask0 = pset(arr1); - const Packet2d mask2 = pset(arr2); + const double neg_zero = numext::bit_cast(0x8000000000000000ull); + const double arr1[2] = {neg_zero, neg_zero}; + const double arr2[2] = {neg_zero, 0.0}; + const Packet2d mask0 = ploadu(arr1); + const Packet2d mask2 = ploadu(arr2); pstoret(&res.x(), pxor(mask0, qe.template packet::Alignment,Packet2d>(0))); pstoret(&res.z(), pxor(mask2, qe.template packet::Alignment,Packet2d>(2))); return res; diff --git a/Eigen/src/Householder/Householder.h b/Eigen/src/Householder/Householder.h index 5bc037f00d18c992606fed9fef11f989fec373d5..d8984a347f6e6bb10a9d9f06b80315aac7b207e7 100644 --- a/Eigen/src/Householder/Householder.h 
+++ b/Eigen/src/Householder/Householder.h @@ -69,7 +69,7 @@ void MatrixBase::makeHouseholder( Scalar& tau, RealScalar& beta) const { - using std::sqrt; + using numext::sqrt; using numext::conj; EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart) diff --git a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h index 153acef65ba921c1ecec39ff670ba2c32b1dfef6..1c9ade5623f9a9ad9c3ed4065073935a81e49441 100644 --- a/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h +++ b/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h @@ -49,9 +49,9 @@ bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, x.setZero(); return true; } - Scalar rho = 1; - Scalar alpha = 1; - Scalar w = 1; + Scalar rho (1); + Scalar alpha (1); + Scalar w (1); VectorType v = VectorType::Zero(n), p = VectorType::Zero(n); VectorType y(n), z(n); diff --git a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 5d8c6b4339ee592b24f6092cfa67a53c51a2a0da..c3ca0ad54c0d0967dc5c3bd399f00a44fa9bc1b9 100644 --- a/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -29,8 +29,6 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters, typename Dest::RealScalar& tol_error) { - using std::sqrt; - using std::abs; typedef typename Dest::RealScalar RealScalar; typedef typename Dest::Scalar Scalar; typedef Matrix VectorType; @@ -56,7 +54,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, if (residualNorm2 < threshold) { iters = 0; - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); return; } @@ -86,7 +84,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, p = z + beta * p; // update search direction i++; } - tol_error = sqrt(residualNorm2 / rhsNorm2); + tol_error = numext::sqrt(residualNorm2 / rhsNorm2); iters = i; } diff --git a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h index 7803fd8170f2f3772fa57abc22726a236026d9ba..5e632c4e2f3555d72569eba6de97d8631747199a 100644 --- a/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h +++ b/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h @@ -160,13 +160,13 @@ class IncompleteCholesky : public SparseSolverBase @@ -143,13 +144,18 @@ inline void compute_inverse_size3_helper( const Matrix& cofactors_col0, ResultType& result) { - result.row(0) = cofactors_col0 * invdet; - result.coeffRef(1,0) = cofactor_3x3(matrix) * invdet; - result.coeffRef(1,1) = cofactor_3x3(matrix) * invdet; + // Compute cofactors in a way that avoids aliasing issues. 
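// (Sketch of the hazard being avoided: in the in-place case the output can
//  alias the input, e.g. an m.computeInverseAndDetWithCheck(m, det, ok) style
//  call. The writes below are therefore ordered so that no entry of matrix is
//  overwritten before the cofactor_3x3 computations that still read it have
//  run: three cofactors are staged in locals and row 0 is written last.)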
+ typedef typename ResultType::Scalar Scalar; + const Scalar c01 = cofactor_3x3(matrix) * invdet; + const Scalar c11 = cofactor_3x3(matrix) * invdet; + const Scalar c02 = cofactor_3x3(matrix) * invdet; result.coeffRef(1,2) = cofactor_3x3(matrix) * invdet; - result.coeffRef(2,0) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,1) = cofactor_3x3(matrix) * invdet; result.coeffRef(2,2) = cofactor_3x3(matrix) * invdet; + result.coeffRef(1,0) = c01; + result.coeffRef(1,1) = c11; + result.coeffRef(2,0) = c02; + result.row(0) = cofactors_col0 * invdet; } template @@ -181,14 +187,13 @@ struct compute_inverse_and_det_with_check bool& invertible ) { - using std::abs; typedef typename ResultType::Scalar Scalar; Matrix cofactors_col0; cofactors_col0.coeffRef(0) = cofactor_3x3(matrix); cofactors_col0.coeffRef(1) = cofactor_3x3(matrix); cofactors_col0.coeffRef(2) = cofactor_3x3(matrix); determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum(); - invertible = abs(determinant) > absDeterminantThreshold; + invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold; if(!invertible) return; const Scalar invdet = Scalar(1) / determinant; compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse); @@ -273,7 +278,13 @@ struct compute_inverse_and_det_with_check using std::abs; determinant = matrix.determinant(); invertible = abs(determinant) > absDeterminantThreshold; - if(invertible) compute_inverse::run(matrix, inverse); + if(invertible && extract_data(matrix) != extract_data(inverse)) { + compute_inverse::run(matrix, inverse); + } + else if(invertible) { + MatrixType matrix_t = matrix; + compute_inverse::run(matrix_t, inverse); + } } }; @@ -347,6 +358,8 @@ inline const Inverse MatrixBase::inverse() const * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of the input matrix when computing the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param determinant Reference to the variable in which to store the determinant. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. @@ -387,6 +400,8 @@ inline void MatrixBase::computeInverseAndDetWithCheck( * * This is only for fixed-size square matrices of size up to 4x4. * + * Notice that it will trigger a copy of the input matrix when computing the inverse in place. + * * \param inverse Reference to the matrix in which to store the inverse. * \param invertible Reference to the bool variable in which to store whether the matrix is invertible. * \param absDeterminantThreshold Optional parameter controlling the invertibility check. diff --git a/Eigen/src/LU/PartialPivLU.h b/Eigen/src/LU/PartialPivLU.h index 46ffdd3202a30b795a869a1a9a396098f676b727..34aed72494d4315d7ba22950445c26fc27b2fbae 100644 --- a/Eigen/src/LU/PartialPivLU.h +++ b/Eigen/src/LU/PartialPivLU.h @@ -504,8 +504,13 @@ struct partial_lu_impl template void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::StorageIndex& nb_transpositions) { + // Special-case of zero matrix.
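// (Why the early return below is needed, as a hedged sketch: an empty
//  decomposition is legal,
//    Eigen::MatrixXd m(0, 0);
//    Eigen::PartialPivLU<Eigen::MatrixXd> lu(m);
//  and a 0x0 input has no pivots to count, while the relaxed pointer-stride
//  assertion that follows would otherwise have to read coeffRef(1) of an
//  empty transposition vector.)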
+ if (lu.rows() == 0 || lu.cols() == 0) { + nb_transpositions = 0; + return; + } eigen_assert(lu.cols() == row_transpositions.size()); - eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); + eigen_assert(row_transpositions.size() < 2 || (&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1); partial_lu_impl < typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, diff --git a/Eigen/src/LU/arch/InverseSize4.h b/Eigen/src/LU/arch/InverseSize4.h index 5a8d0c1145194736849858d546090c849d695270..22ae38ac6870ae8694bcad5e4fe7a4d507b791df 100644 --- a/Eigen/src/LU/arch/InverseSize4.h +++ b/Eigen/src/LU/arch/InverseSize4.h @@ -35,6 +35,13 @@ #ifndef EIGEN_INVERSE_SIZE_4_H #define EIGEN_INVERSE_SIZE_4_H +#if EIGEN_COMP_GNUC_STRICT +// These routines require bit manipulation of the sign, which is not compatible +// with fastmath. +#pragma GCC push_options +#pragma GCC optimize ("no-fast-math") +#endif + namespace Eigen { namespace internal { @@ -54,10 +61,12 @@ struct compute_inverse_size4(0); - Packet4f _L2 = matrix.template packet(4); - Packet4f _L3 = matrix.template packet(8); - Packet4f _L4 = matrix.template packet(12); + const float* data = matrix.data(); + const Index stride = matrix.innerStride(); + Packet4f _L1 = ploadt(data); + Packet4f _L2 = ploadt(data + stride*4); + Packet4f _L3 = ploadt(data + stride*8); + Packet4f _L4 = ploadt(data + stride*12); // Four 2x2 sub-matrices of the input matrix // input = [[A, B], @@ -141,8 +150,8 @@ struct compute_inverse_size4(sign_mask); + EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f}; + const Packet4f p4f_sign_PNNP = pload(sign_mask); rd = pxor(rd, p4f_sign_PNNP); iA = pmul(iA, rd); iB = pmul(iB, rd); @@ -189,25 +198,26 @@ struct compute_inverse_size4(0); - B1 = matrix.template packet(2); - A2 = matrix.template packet(4); - B2 = matrix.template packet(6); - C1 = matrix.template packet(8); - D1 = matrix.template packet(10); - C2 = matrix.template packet(12); - D2 = matrix.template packet(14); + A1 = ploadt(data + stride*0); + B1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + B2 = ploadt(data + stride*6); + C1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + C2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); } else { Packet2d temp; - A1 = matrix.template packet(0); - C1 = matrix.template packet(2); - A2 = matrix.template packet(4); - C2 = matrix.template packet(6); - + A1 = ploadt(data + stride*0); + C1 = ploadt(data + stride*2); + A2 = ploadt(data + stride*4); + C2 = ploadt(data + stride*6); temp = A1; A1 = vec2d_unpacklo(A1, A2); A2 = vec2d_unpackhi(temp, A2); @@ -216,10 +226,10 @@ struct compute_inverse_size4(8); - D1 = matrix.template packet(10); - B2 = matrix.template packet(12); - D2 = matrix.template packet(14); + B1 = ploadt(data + stride*8); + D1 = ploadt(data + stride*10); + B2 = ploadt(data + stride*12); + D2 = ploadt(data + stride*14); temp = B1; B1 = vec2d_unpacklo(B1, B2); @@ -323,10 +333,10 @@ struct compute_inverse_size4(sign_mask1); - const Packet2d sign_NP = pset(sign_mask2); + EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0}; + EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0}; + const Packet2d sign_PN = pload(sign_mask1); + const Packet2d sign_NP = pload(sign_mask2); d1 = pxor(rd, sign_PN); d2 = pxor(rd, sign_NP); @@ -345,4 +355,9 @@ struct compute_inverse_size4 > int m_ordering; // Ordering method to use, see SPQR's manual int m_allow_tol; // Allow to use some tolerance during numerical
factorization. RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero - mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format + mutable cholmod_sparse *m_cR = nullptr; // The sparse R factor in cholmod format mutable MatrixType m_R; // The sparse matrix R in Eigen format - mutable StorageIndex *m_E; // The permutation applied to columns - mutable cholmod_sparse *m_H; //The householder vectors - mutable StorageIndex *m_HPinv; // The row permutation of H - mutable cholmod_dense *m_HTau; // The Householder coefficients + mutable StorageIndex *m_E = nullptr; // The permutation applied to columns + mutable cholmod_sparse *m_H = nullptr; //The householder vectors + mutable StorageIndex *m_HPinv = nullptr; // The row permutation of H + mutable cholmod_dense *m_HTau = nullptr; // The Householder coefficients mutable Index m_rank; // The rank of the matrix mutable cholmod_common m_cc; // Workspace and parameters bool m_useDefaultThreshold; // Use default threshold diff --git a/Eigen/src/SVD/BDCSVD.h b/Eigen/src/SVD/BDCSVD.h index 17f8e44364ddeb2ddcf41ede841868f10595504c..79a6562b7cf469a028bfa37fbaf6736dd9f53bd4 100644 --- a/Eigen/src/SVD/BDCSVD.h +++ b/Eigen/src/SVD/BDCSVD.h @@ -27,6 +27,10 @@ #define eigen_internal_assert(X) assert(X); #endif +#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE +#include +#endif + namespace Eigen { #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE @@ -172,7 +176,7 @@ public: void setSwitchSize(int s) { - eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 3"); + eigen_assert(s>=3 && "BDCSVD the size of the algo switch has to be at least 3."); m_algoswap = s; } @@ -404,7 +408,7 @@ void BDCSVD::structured_update(Block A, co //@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; // lastCol + 1 - firstCol is the size of the submatrix. //@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W) -//@param firstRowW : Same as firstRowW with the column. +//@param firstColW : Same as firstRowW with the column. //@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix // to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper. template @@ -899,7 +903,7 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); eigen_internal_assert(fLeft::computeSingVals(const ArrayRef& col0, const ArrayRef& d // perturb singular value slightly if it equals diagonal entry to avoid division by zero later // (deflation is supposed to avoid this from happening) // - this does no seem to be necessary anymore - -// if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); -// if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); + // if (singVals[k] == left) singVals[k] *= 1 + NumTraits::epsilon(); + // if (singVals[k] == right) singVals[k] *= 1 - NumTraits::epsilon(); } } @@ -1029,7 +1033,14 @@ void BDCSVD::perturbCol0 std::cout << " " << "j=" << j << "\n"; } #endif - Index j = i= k && l == 0) { + m_info = NumericalIssue; + prod = 0; + break; + } + Index j = i 0 ? 
perm(l-1) : i; #ifdef EIGEN_BDCSVD_SANITY_CHECKS if(!(dk!=Literal(0) || diag(i)!=Literal(0))) { @@ -1242,8 +1253,8 @@ void BDCSVD::deflation(Eigen::Index firstCol, Eigen::Index lastCol, #endif { // Check for total deflation - // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting - bool total_deflation = (col0.tail(length-1).array() - TransposeTypeWithSameStorageOrder; + + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -202,13 +202,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) - : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) - : MatrixType::Options + Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -303,8 +302,9 @@ public: Options = MatrixType::Options }; - typedef Matrix - TransposeTypeWithSameStorageOrder; + typedef typename internal::make_proper_matrix_type< + Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime + >::type TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) { @@ -680,6 +680,7 @@ JacobiSVD::compute(const MatrixType& matrix, unsig if (!(numext::isfinite)(scale)) { m_isInitialized = true; m_info = InvalidInput; + m_nonzeroSingularValues = 0; return *this; } if(scale==RealScalar(0)) scale = RealScalar(1); diff --git a/Eigen/src/SVD/UpperBidiagonalization.h b/Eigen/src/SVD/UpperBidiagonalization.h index 997defc474419f5428529010054cfbed9b93bb96..a5b2f60d2122f7fedcc2bc4f1ce28a98828ec2f5 100644 --- a/Eigen/src/SVD/UpperBidiagonalization.h +++ b/Eigen/src/SVD/UpperBidiagonalization.h @@ -161,13 +161,14 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, typedef typename MatrixType::Scalar Scalar; typedef typename MatrixType::RealScalar RealScalar; typedef typename NumTraits::Literal Literal; - enum { StorageOrder = traits::Flags & RowMajorBit }; - typedef InnerStride ColInnerStride; - typedef InnerStride RowInnerStride; + static const int StorageOrder = + (traits::Flags & RowMajorBit) ? RowMajor : ColMajor; + typedef InnerStride ColInnerStride; + typedef InnerStride RowInnerStride; typedef Ref, 0, ColInnerStride> SubColumnType; typedef Ref, 0, RowInnerStride> SubRowType; typedef Ref > SubMatType; - + Index brows = A.rows(); Index bcols = A.cols(); @@ -293,7 +294,7 @@ void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagona Index size = (std::min)(rows, cols); // X and Y are work space - enum { StorageOrder = traits::Flags & RowMajorBit }; + enum { StorageOrder = (traits::Flags & RowMajorBit) ? 
RowMajor : ColMajor }; Matrix CholMatrixType tmp(size,size); ConstCholMatrixPtr pmat; - if(m_P.size()==0 && (UpLo&Upper)==Upper) + if(m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper) { // If there is no ordering, try to directly use the input matrix without any copy internal::simplicial_cholesky_grab_input::run(a, pmat, tmp); diff --git a/Eigen/src/SparseCore/SparseBlock.h b/Eigen/src/SparseCore/SparseBlock.h index 5b4f6cc9f3f81e4857ba64021c4c34a8e0f23af9..c16caec704e66e0530f48a1d209f2009f28f979f 100644 --- a/Eigen/src/SparseCore/SparseBlock.h +++ b/Eigen/src/SparseCore/SparseBlock.h @@ -429,12 +429,7 @@ struct unary_evaluator, IteratorBa enum { IsRowMajor = XprType::IsRowMajor, - - OuterVector = (BlockCols==1 && ArgType::IsRowMajor) - | // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&". - // revert to || as soon as not needed anymore. - (BlockRows==1 && !ArgType::IsRowMajor), - + OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor), CoeffReadCost = evaluator::CoeffReadCost, Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 6130bab430be7dbd3f6953b0006248a80a52399d..9b0d3f98dcd7c9ad9d446d65bc3de39e7b03c90a 100644 --- a/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -126,7 +126,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -211,7 +211,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -298,7 +298,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -457,7 +457,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -530,7 +530,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -604,7 +604,7 @@ public: enum { - CoeffReadCost = evaluator::CoeffReadCost + evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index df6c28d2b89be903c1e2b98e5a38d6b8fd6ff06b..32dac0f7863eb8b572c1fd4bf21752755a034aaf 100644 --- a/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -24,7 +24,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; @@ -79,7 +79,7 @@ 
struct unary_evaluator, IteratorBased> class InnerIterator; enum { - CoeffReadCost = evaluator::CoeffReadCost + functor_traits::Cost, + CoeffReadCost = int(evaluator::CoeffReadCost) + int(functor_traits::Cost), Flags = XprType::Flags }; diff --git a/Eigen/src/SparseCore/SparseMap.h b/Eigen/src/SparseCore/SparseMap.h index f99be3379db2ac32d9cc4dd18f86c3ecf138998a..6247d79bd9a525d9e26f80eac20ecafcafaa227e 100644 --- a/Eigen/src/SparseCore/SparseMap.h +++ b/Eigen/src/SparseCore/SparseMap.h @@ -237,6 +237,7 @@ class Map /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients, * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr. * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed. + * The inner indices must be sorted appropriately. * * This constructor is available only if \c SparseMatrixType is non-const. * diff --git a/Eigen/src/SparseCore/SparseMatrix.h b/Eigen/src/SparseCore/SparseMatrix.h index 616b4a0c24e7b7b2e725bd4a3e7b62388a00e2a4..9fc06b5e712f440cb5746f53e79b7601602c2733 100644 --- a/Eigen/src/SparseCore/SparseMatrix.h +++ b/Eigen/src/SparseCore/SparseMatrix.h @@ -793,6 +793,7 @@ class SparseMatrix template EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase& other); +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m) { EIGEN_DBG_SPARSE( @@ -837,6 +838,7 @@ class SparseMatrix s << static_cast&>(m); return s; } +#endif /** Destructor */ inline ~SparseMatrix() diff --git a/Eigen/src/SparseCore/SparseMatrixBase.h b/Eigen/src/SparseCore/SparseMatrixBase.h index 229449f02274463f3242ea2283faf3a93ab006cb..417a2365e04173301ae64fb257db2637c00d6431 100644 --- a/Eigen/src/SparseCore/SparseMatrixBase.h +++ b/Eigen/src/SparseCore/SparseMatrixBase.h @@ -113,7 +113,7 @@ template class SparseMatrixBase Transpose >::type AdjointReturnType; typedef Transpose TransposeReturnType; - typedef typename internal::add_const >::type ConstTransposeReturnType; + typedef Transpose ConstTransposeReturnType; // FIXME storage order do not match evaluator storage order typedef SparseMatrix PlainObject; @@ -214,7 +214,7 @@ template class SparseMatrixBase inline void assignGeneric(const OtherDerived& other); public: - +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m) { typedef typename Derived::Nested Nested; @@ -263,6 +263,7 @@ template class SparseMatrixBase } return s; } +#endif template Derived& operator+=(const SparseMatrixBase& other); diff --git a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h index 88820a48f364a8a7663ad37d0a9505550cdcba44..25ce404b8976d3e2ed83fbb8b499b13c456c682a 100644 --- a/Eigen/src/SparseCore/SparseSparseProductWithPruning.h +++ b/Eigen/src/SparseCore/SparseSparseProductWithPruning.h @@ -90,9 +90,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res.swap(res_); } }; @@ -104,9 +104,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; - SparseTemporaryType _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, 
_res, tolerance); - res = _res; + SparseTemporaryType res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(lhs, rhs, res_, tolerance); + res = res_; } }; @@ -117,9 +117,9 @@ struct sparse_sparse_product_with_pruning_selector::type _res(res.rows(), res.cols()); - internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, _res, tolerance); - res.swap(_res); + typename remove_all::type res_(res.rows(), res.cols()); + internal::sparse_sparse_product_with_pruning_impl(rhs, lhs, res_, tolerance); + res.swap(res_); } }; @@ -137,9 +137,9 @@ struct sparse_sparse_product_with_pruning_selector SparseTemporaryType; -// SparseTemporaryType _res(res.cols(), res.rows()); -// sparse_sparse_product_with_pruning_impl(rhs, lhs, _res); -// res = _res.transpose(); +// SparseTemporaryType res_(res.cols(), res.rows()); +// sparse_sparse_product_with_pruning_impl(rhs, lhs, res_); +// res = res_.transpose(); } }; diff --git a/Eigen/src/SparseCore/SparseVector.h b/Eigen/src/SparseCore/SparseVector.h index 05779be685b8f39d70f761e8064c7f0c2665d7b6..106925be4e51959432990a7d6696a97f4f39763f 100644 --- a/Eigen/src/SparseCore/SparseVector.h +++ b/Eigen/src/SparseCore/SparseVector.h @@ -329,6 +329,7 @@ class SparseVector } #endif +#ifndef EIGEN_NO_IO friend std::ostream & operator << (std::ostream & s, const SparseVector& m) { for (Index i=0; i } - Index count = 0; +// Index count = 0; // FIXME compute a reference value to filter zeros for (typename AmbiVector::Iterator it(tempVector/*,1e-12*/); it; ++it) { - ++ count; +// ++ count; // std::cerr << "fill " << it.index() << ", " << col << "\n"; // std::cout << it.value() << " "; // FIXME use insertBack diff --git a/Eigen/src/SparseLU/SparseLU.h b/Eigen/src/SparseLU/SparseLU.h index 0c8d8939be2e21089f6883341b4339d5f4868536..6eb79502fdc38772b190dd5e527ec539dc813222 100644 --- a/Eigen/src/SparseLU/SparseLU.h +++ b/Eigen/src/SparseLU/SparseLU.h @@ -35,9 +35,10 @@ public: MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime }; - SparseLUTransposeView() : m_sparseLU(NULL) {} - SparseLUTransposeView(const SparseLUTransposeView& view) { + SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {} + SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() { this->m_sparseLU = view.m_sparseLU; + this->m_isInitialized = view.m_isInitialized; } void setIsInitialized(const bool isInitialized) {this->m_isInitialized = isInitialized;} void setSparseLU(SparseLUType* sparseLU) {m_sparseLU = sparseLU;} @@ -752,10 +753,13 @@ void SparseLU::factorize(const MatrixType& matrix) info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu); if ( info ) { - m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT "; + m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR"; +#ifndef EIGEN_NO_IO std::ostringstream returnInfo; - returnInfo << info; + returnInfo << " ... ZERO COLUMN AT "; + returnInfo << info; m_lastError += returnInfo.str(); +#endif m_info = NumericalIssue; m_factorizationIsOk = false; return; @@ -830,7 +834,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator template void solveInPlace(MatrixBase &X) const { Index nrhs = X.cols(); - Index n = X.rows(); // Backward solve with U for (Index k = m_mapL.nsuper(); k >= 0; k--) { @@ -850,7 +853,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { // FIXME: the following lines should use Block expressions and not Map! 
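// (The replacement pattern applied here, sketched: rather than re-viewing X's
//  storage through a raw Map with an explicit outer stride,
//    Map<Matrix<Scalar,Dynamic,Dynamic>, 0, OuterStride<> >
//        U(&X(fsupc,0), nsupc, nrhs, OuterStride<>(n));
//  the patch takes a block expression,
//    typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
//  which involves no pointer arithmetic and stays correct for any storage
//  layout of the destination.)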
Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X.coeffRef(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); U = A.template triangularView().solve(U); } @@ -873,7 +876,6 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator { using numext::conj; Index nrhs = X.cols(); - Index n = X.rows(); // Forward solve with U for (Index k = 0; k <= m_mapL.nsuper(); k++) { @@ -904,7 +906,7 @@ struct SparseLUMatrixUReturnType : internal::no_assignment_operator else { Map, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = A.adjoint().template triangularView().solve(U); else diff --git a/Eigen/src/SparseLU/SparseLU_Structs.h b/Eigen/src/SparseLU/SparseLU_Structs.h index cf5ec449bec3474860b49c3348a90c15cb15fed2..16a0c41f4654c2cf912b7d75346bd7c318b455c9 100644 --- a/Eigen/src/SparseLU/SparseLU_Structs.h +++ b/Eigen/src/SparseLU/SparseLU_Structs.h @@ -70,8 +70,8 @@ #define EIGEN_LU_STRUCTS namespace Eigen { namespace internal { - -typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; + +enum MemType {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL}; template struct LU_GlobalLU_t { diff --git a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h index 0be293d17fa51110a211fb99b22ad3e8c67c4560..fd5e9fa51ce01fe5652295471196b4a1daeb831f 100644 --- a/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h +++ b/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h @@ -274,9 +274,8 @@ void MappedSuperNodalMatrix::solveInPlace( MatrixBase&X) co // Triangular solve Map, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); - U = A.template triangularView().solve(U); - + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); + U = A.template triangularView().solve(U); // Matrix-vector product new (&A) Map, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); work.topRows(nrow).noalias() = A * U; @@ -349,7 +348,7 @@ void MappedSuperNodalMatrix::solveTransposedInPlace( MatrixBase, 0, OuterStride<> > A( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) ); - Map< Matrix, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); + typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc); if(Conjugate) U = U - A.adjoint() * work.topRows(nrow); else diff --git a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/Eigen/src/SparseLU/SparseLU_gemm_kernel.h deleted file mode 100644 index e37c2fe0d028482549db8186a6650745ce0f6db7..0000000000000000000000000000000000000000 --- a/Eigen/src/SparseLU/SparseLU_gemm_kernel.h +++ /dev/null @@ -1,280 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2012 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
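The substitutions above replace raw Map-with-OuterStride views of X by middleRows() block expressions, which are valid for any destination expression. A small standalone sketch of that API (values illustrative, not from the patch):

\code
// Illustrative sketch of middleRows(), the block expression the patch
// substitutes for Map with an OuterStride: it selects rows [i, i+n) and
// writes through to the underlying matrix.
#include <Eigen/Dense>

int main() {
  Eigen::MatrixXd X = Eigen::MatrixXd::Ones(8, 3);
  X.middleRows(2, 4) *= 5.0;  // scales rows 2..5 of X in place
  return 0;
}
\endcode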
- -#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H -#define EIGEN_SPARSELU_GEMM_KERNEL_H - -namespace Eigen { - -namespace internal { - - -/** \internal - * A general matrix-matrix product kernel optimized for the SparseLU factorization. - * - A, B, and C must be column major - * - lda and ldc must be multiples of the respective packet size - * - C must have the same alignment as A - */ -template -EIGEN_DONT_INLINE -void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc) -{ - using namespace Eigen::internal; - - typedef typename packet_traits::type Packet; - enum { - NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS, - PacketSize = packet_traits::size, - PM = 8, // peeling in M - RN = 2, // register blocking - RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking - BM = 4096/sizeof(Scalar), // number of rows of A-C per chunk - SM = PM*PacketSize // step along M - }; - Index d_end = (d/RK)*RK; // number of columns of A (rows of B) suitable for full register blocking - Index n_end = (n/RN)*RN; // number of columns of B-C suitable for processing RN columns at once - Index i0 = internal::first_default_aligned(A,m); - - eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_default_aligned(C,m))); - - // handle the non aligned rows of A and C without any optimization: - for(Index i=0; i(BM, m-ib); // actual number of rows - Index actual_b_end1 = (actual_b/SM)*SM; // actual number of rows suitable for peeling - Index actual_b_end2 = (actual_b/PacketSize)*PacketSize; // actual number of rows suitable for vectorization - - // Let's process two columns of B-C at once - for(Index j=0; j(Bc0[0]); } - { b10 = pset1(Bc0[1]); } - if(RK==4) { b20 = pset1(Bc0[2]); } - if(RK==4) { b30 = pset1(Bc0[3]); } - { b01 = pset1(Bc1[0]); } - { b11 = pset1(Bc1[1]); } - if(RK==4) { b21 = pset1(Bc1[2]); } - if(RK==4) { b31 = pset1(Bc1[3]); } - - Packet a0, a1, a2, a3, c0, c1, t0, t1; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = C+ib+(j+0)*ldc; - Scalar* C1 = C+ib+(j+1)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);} -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - c1 = pload(C1+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - KMADD(c1, a0, b01, t1) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - KMADD(c1, a1, b11, t1) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ KMADD(c1, a2, b21, t1) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ KMADD(c1, a3, b31, t1) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); \ - pstore(C1+i+(I)*PacketSize, c1) - - // process rows of A' - C' with aggressive vectorization and peeling - for(Index i=0; i0) - { - const Scalar* Bc0 = B+(n-1)*ldb; - - for(Index k=0; k(Bc0[0]); - b10 = pset1(Bc0[1]); - if(RK==4) b20 = pset1(Bc0[2]); - if(RK==4) b30 = pset1(Bc0[3]); - - Packet a0, a1, a2, a3, c0, t0/*, t1*/; - - const Scalar* A0 = A+ib+(k+0)*lda; - const Scalar* A1 = A+ib+(k+1)*lda; - const Scalar* A2 = A+ib+(k+2)*lda; - const Scalar* A3 = A+ib+(k+3)*lda; - - Scalar* C0 = 
C+ib+(n_end)*ldc; - - a0 = pload(A0); - a1 = pload(A1); - if(RK==4) - { - a2 = pload(A2); - a3 = pload(A3); - } - else - { - // workaround "may be used uninitialized in this function" warning - a2 = a3 = a0; - } - -#define WORK(I) \ - c0 = pload(C0+i+(I)*PacketSize); \ - KMADD(c0, a0, b00, t0) \ - a0 = pload(A0+i+(I+1)*PacketSize); \ - KMADD(c0, a1, b10, t0) \ - a1 = pload(A1+i+(I+1)*PacketSize); \ - if(RK==4){ KMADD(c0, a2, b20, t0) }\ - if(RK==4){ a2 = pload(A2+i+(I+1)*PacketSize); }\ - if(RK==4){ KMADD(c0, a3, b30, t0) }\ - if(RK==4){ a3 = pload(A3+i+(I+1)*PacketSize); }\ - pstore(C0+i+(I)*PacketSize, c0); - - // aggressive vectorization and peeling - for(Index i=0; i0) - { - for(Index j=0; j1 ? Aligned : 0 - }; - typedef Map, Alignment > MapVector; - typedef Map, Alignment > ConstMapVector; - if(rd==1) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b); - - else if(rd==2) MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b); - - else MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b) - + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b) - + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b); - } - } - - } // blocking on the rows of A and C -} -#undef KMADD - -} // namespace internal - -} // namespace Eigen - -#endif // EIGEN_SPARSELU_GEMM_KERNEL_H diff --git a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h index 6f75d500e5f831f414175ce46dbceffa0acd5539..7aecbcad8ed2703000d62cfd5d88d983c69a7423 100644 --- a/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h +++ b/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h @@ -75,8 +75,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe // Identify the relaxed supernodes by postorder traversal of the etree Index snode_start; // beginning of a snode StorageIndex k; - Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree - Index nsuper_et = 0; // Number of relaxed snodes in the original etree StorageIndex l; for (j = 0; j < n; ) { @@ -88,7 +86,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe parent = et(j); } // Found a supernode in postordered etree, j is the last column - ++nsuper_et_post; k = StorageIndex(n); for (Index i = snode_start; i <= j; ++i) k = (std::min)(k, inv_post(i)); @@ -97,7 +94,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe { // This is also a supernode in the original etree relax_end(k) = l; // Record last column - ++nsuper_et; } else { @@ -107,7 +103,6 @@ void SparseLUImpl::heap_relax_snode (const Index n, IndexVe if (descendants(i) == 0) { relax_end(l) = l; - ++nsuper_et; } } } diff --git a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h index 8c1b3e8bc67c89ea80b81f22691695cf6cebf90c..7a101ea0c862bf389c0aaf0958379eb598a7e92e 100644 --- a/Eigen/src/SparseLU/SparseLU_kernel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_kernel_bmod.h @@ -69,8 +69,7 @@ EIGEN_DONT_INLINE void LU_kernel_bmod::run(const Index seg Index aligned_with_B_offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize))%PacketSize; Map, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) ); - l.setZero(); - internal::sparselu_gemm(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), 
l.outerStride()); + l.noalias() = B * u; // Scatter tempv[] into SPA dense[] as a temporary storage isub = lptr + no_zeros; diff --git a/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/Eigen/src/SparseLU/SparseLU_panel_bmod.h index f052001c8f8880c17909959687bd2be4a97ae02a..92cdb0e45c457fcbb44ec0efe32888b4cef51f5e 100644 --- a/Eigen/src/SparseLU/SparseLU_panel_bmod.h +++ b/Eigen/src/SparseLU/SparseLU_panel_bmod.h @@ -148,8 +148,7 @@ void SparseLUImpl::panel_bmod(const Index m, const Index w, Index offset = (PacketSize-internal::first_default_aligned(B.data(), PacketSize)) % PacketSize; MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl)); - L.setZero(); - internal::sparselu_gemm(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride()); + L.noalias() = B * U; // scatter U and L u_col = 0; diff --git a/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 0e5d5445b18fcda01a43e33e958a9a964b8d1354..1b422e2015aab9ced8e029d789615fd1922512bc 100644 --- a/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -30,15 +30,53 @@ operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa max() */ -EIGEN_MAKE_CWISE_BINARY_OP(min,min) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const OtherDerived &other) const +{ + return (min)(other); +} /** \returns an expression of the coefficient-wise min of \c *this and scalar \a other * * \sa max() */ +template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, - const CwiseNullaryOp, PlainObject> > +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +min +#else +(min) +#endif +(const Scalar &other) const +{ + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + +EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN min #else @@ -46,7 +84,7 @@ min #endif (const Scalar &other) const { - return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (min)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of \c *this and \a other @@ -56,14 +94,52 @@ min * * \sa min() */ -EIGEN_MAKE_CWISE_BINARY_OP(max,max) +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const OtherDerived &other) const +{ + return (max)(other); +} /** \returns an expression of the coefficient-wise max of \c *this and scalar \a other * * \sa min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const 
CwiseBinaryOp, const Derived, + const CwiseNullaryOp, PlainObject> > +#ifdef EIGEN_PARSED_BY_DOXYGEN +max +#else +(max) +#endif +(const Scalar &other) const +{ + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const CwiseNullaryOp, PlainObject> > #ifdef EIGEN_PARSED_BY_DOXYGEN max @@ -72,7 +148,7 @@ max #endif (const Scalar &other) const { - return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); + return (max)(Derived::PlainObject::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise absdiff of \c *this and \a other diff --git a/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/Eigen/src/plugins/ArrayCwiseUnaryOps.h index b7ea22a9d8aae2f51682c5272432e660787f772a..13c55f4b115880c0becb6246bdc1fb86c9f404fa 100644 --- a/Eigen/src/plugins/ArrayCwiseUnaryOps.h +++ b/Eigen/src/plugins/ArrayCwiseUnaryOps.h @@ -497,6 +497,45 @@ ceil() const return CeilReturnType(derived()); } +template struct ShiftRightXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type arithmetically + * shifted right by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftLeft() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftRightXpr::Type +shiftRight() const +{ + return typename ShiftRightXpr::Type(derived()); +} + + +template struct ShiftLeftXpr { + typedef CwiseUnaryOp, const Derived> Type; +}; + +/** \returns an expression of \c *this with the \a Scalar type logically + * shifted left by \a N bit positions. + * + * The template parameter \a N specifies the number of bit positions to shift. + * + * \sa shiftRight() + */ +template +EIGEN_DEVICE_FUNC +typename ShiftLeftXpr::Type +shiftLeft() const +{ + return typename ShiftLeftXpr::Type(derived()); +} + /** \returns an expression of the coefficient-wise isnan of *this. * * Example: \include Cwise_isNaN.cpp diff --git a/Eigen/src/plugins/CommonCwiseUnaryOps.h b/Eigen/src/plugins/CommonCwiseUnaryOps.h index 42ff901ca5252b1068be6a2e111b336bda6f9b7f..5418dc4154f2b078e964383d6bb5acebe0bf3210 100644 --- a/Eigen/src/plugins/CommonCwiseUnaryOps.h +++ b/Eigen/src/plugins/CommonCwiseUnaryOps.h @@ -64,49 +64,6 @@ cast() const return typename CastXpr::Type(derived()); } -template struct ShiftRightXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type arithmetically -/// shifted right by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftRightXpr::Type -shift_right() const -{ - return typename ShiftRightXpr::Type(derived()); -} - - -template struct ShiftLeftXpr { - typedef CwiseUnaryOp, const Derived> Type; -}; - -/// \returns an expression of \c *this with the \a Scalar type logically -/// shifted left by \a N bit positions. -/// -/// The template parameter \a N specifies the number of bit positions to shift. -/// -EIGEN_DOC_UNARY_ADDONS(cast,conversion function) -/// -/// \sa class CwiseUnaryOp -/// -template -EIGEN_DEVICE_FUNC -typename ShiftLeftXpr::Type -shift_left() const -{ - return typename ShiftLeftXpr::Type(derived()); -} - /// \returns an expression of the complex conjugate of \c *this. 
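The shiftRight()/shiftLeft() documentation above describes coefficient-wise bit shifts with a compile-time shift count. A short usage sketch (the operator names are as documented in the patch; the values are illustrative):

\code
// Usage sketch for the coefficient-wise shift operators documented above;
// N is a compile-time template parameter giving the number of bit positions.
#include <Eigen/Dense>

int main() {
  Eigen::ArrayXi a(3);
  a << 4, 8, 12;
  Eigen::ArrayXi r = a.shiftRight<2>();  // arithmetic shift: 1, 2, 3
  Eigen::ArrayXi l = a.shiftLeft<1>();   // logical shift:    8, 16, 24
  return 0;
}
\endcode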
/// EIGEN_DOC_UNARY_ADDONS(conjugate,complex conjugate) diff --git a/Eigen/src/plugins/IndexedViewMethods.h b/Eigen/src/plugins/IndexedViewMethods.h index 5bfb19ac6cdfd5a66fdf15f001d72f7efc2a8300..15c35b0bf886fa964c38e86ce9386adefc01788a 100644 --- a/Eigen/src/plugins/IndexedViewMethods.h +++ b/Eigen/src/plugins/IndexedViewMethods.h @@ -90,8 +90,8 @@ operator()(const RowIndices& rowIndices, const ColIndices& colIndices) EIGEN_IND return BlockType(derived(), internal::first(actualRowIndices), internal::first(actualColIndices), - internal::size(actualRowIndices), - internal::size(actualColIndices)); + internal::index_list_size(actualRowIndices), + internal::index_list_size(actualColIndices)); } // The following overload returns a Scalar @@ -168,7 +168,7 @@ operator()(const Indices& indices) EIGEN_INDEXED_VIEW_METHOD_CONST EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) typename IvcType::type actualIndices = ivcSize(indices); return VectorBlock::value> - (derived(), internal::first(actualIndices), internal::size(actualIndices)); + (derived(), internal::first(actualIndices), internal::index_list_size(actualIndices)); } template diff --git a/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/Eigen/src/plugins/MatrixCwiseBinaryOps.h index f1084abefbcdcb8368683fe01a4f01c71cc85621..514d83a71de96e3855b6a060a0300e3f1f54d29c 100644 --- a/Eigen/src/plugins/MatrixCwiseBinaryOps.h +++ b/Eigen/src/plugins/MatrixCwiseBinaryOps.h @@ -39,10 +39,10 @@ cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise != operator of *this and \a other @@ -59,10 +59,10 @@ cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const */ template EIGEN_DEVICE_FUNC -inline const CwiseBinaryOp, const Derived, const OtherDerived> +inline const CwiseBinaryOp, const Derived, const OtherDerived> cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); } /** \returns an expression of the coefficient-wise min of *this and \a other @@ -72,23 +72,39 @@ cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const * * \sa class CwiseBinaryOp, max() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return cwiseMin(other); } /** \returns an expression of the coefficient-wise min of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> 
+cwiseMin(const Scalar &other) const +{ + return cwiseMin(Derived::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMin(const Scalar &other) const { - return cwiseMin(Derived::Constant(rows(), cols(), other)); + return cwiseMin(Derived::Constant(rows(), cols(), other)); } /** \returns an expression of the coefficient-wise max of *this and \a other @@ -98,23 +114,39 @@ cwiseMin(const Scalar &other) const * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const +{ + return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); +} + template EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const OtherDerived> cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS &other) const { - return CwiseBinaryOp, const Derived, const OtherDerived>(derived(), other.derived()); + return cwiseMax(other); } /** \returns an expression of the coefficient-wise max of *this and scalar \a other * * \sa class CwiseBinaryOp, min() */ +template +EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +cwiseMax(const Scalar &other) const +{ + return cwiseMax(Derived::Constant(rows(), cols(), other)); +} + EIGEN_DEVICE_FUNC -EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> +EIGEN_STRONG_INLINE const CwiseBinaryOp, const Derived, const ConstantReturnType> cwiseMax(const Scalar &other) const { - return cwiseMax(Derived::Constant(rows(), cols(), other)); + return cwiseMax(Derived::Constant(rows(), cols(), other)); } diff --git a/bench/basicbenchmark.h b/bench/basicbenchmark.h index 3fdc35732f02010997e1bcfd745854a60cdf69be..8059375b5e1fde71a28583b2c62f667f5cb84154 100644 --- a/bench/basicbenchmark.h +++ b/bench/basicbenchmark.h @@ -16,13 +16,13 @@ void benchBasic_loop(const MatrixType& I, MatrixType& m, int iterations) { asm("#begin_bench_loop LazyEval"); if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize"); - m = (I + 0.00005 * (m + m.lazy() * m)).eval(); + m = (I + 0.00005 * (m + m.lazyProduct(m))).eval(); } else if (Mode==OmpEval) { asm("#begin_bench_loop OmpEval"); if (MatrixType::SizeAtCompileTime!=Eigen::Dynamic) asm("#fixedsize"); - m = (I + 0.00005 * (m + m.lazy() * m)).evalOMP(); + m = (I + 0.00005 * (m + m.lazyProduct(m))).eval(); } else { diff --git a/bench/btl/libs/STL/STL_interface.hh b/bench/btl/libs/STL/STL_interface.hh index 16658c4baa1768e6de86513dde0badaf102e3b9c..5b391c6ef59d0781cf418d9f49e763d6ba69b0b6 100644 --- a/bench/btl/libs/STL/STL_interface.hh +++ b/bench/btl/libs/STL/STL_interface.hh @@ -84,9 +84,12 @@ public : for (int j=0;j=j) + { for (int k=0;k suite(device, N); \ cudaDeviceSynchronize(); \ @@ -40,7 +40,7 @@ BM_FuncGPU(fullReduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ @@ -59,7 +59,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, 
DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ diff --git a/bench/tensors/tensor_benchmarks_gpu.cu b/bench/tensors/tensor_benchmarks_gpu.cu index 76d68c5c1b80cc7fa1b333417587bbf94a76b335..c778102645192b475fda75c81a39bac88617c49e 100644 --- a/bench/tensors/tensor_benchmarks_gpu.cu +++ b/bench/tensors/tensor_benchmarks_gpu.cu @@ -10,7 +10,7 @@ #define BM_FuncGPU(FUNC) \ static void BM_##FUNC(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ @@ -40,7 +40,7 @@ BM_FuncGPU(fullReduction); #define BM_FuncWithInputDimsGPU(FUNC, D1, D2, D3) \ static void BM_##FUNC##_##D1##x##D2##x##D3(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, D1, D2, D3); \ cudaDeviceSynchronize(); \ @@ -59,7 +59,7 @@ BM_FuncWithInputDimsGPU(contraction, N, N, 64); #define BM_FuncWithKernelDimsGPU(FUNC, DIM1, DIM2) \ static void BM_##FUNC##_##DIM1##x##DIM2(int iters, int N) { \ StopBenchmarkTiming(); \ - Eigen::CudaStreamDevice stream; \ + Eigen::GpuStreamDevice stream; \ Eigen::GpuDevice device(&stream); \ BenchmarkSuite suite(device, N); \ cudaDeviceSynchronize(); \ diff --git a/blas/CMakeLists.txt b/blas/CMakeLists.txt index 545bc989c65196c22a3847698436d6deaf5e121d..c530957fbdecda8b21ae67ebc1b50e873fffc846 100644 --- a/blas/CMakeLists.txt +++ b/blas/CMakeLists.txt @@ -1,6 +1,7 @@ project(EigenBlas CXX) +if(EIGEN_BUILD_BLAS) include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) @@ -26,24 +27,31 @@ else() set(EigenBlas_SRCS ${EigenBlas_SRCS} f2c/complexdots.c) endif() +set(EIGEN_BLAS_TARGETS "") + add_library(eigen_blas_static ${EigenBlas_SRCS}) -add_library(eigen_blas SHARED ${EigenBlas_SRCS}) +list(APPEND EIGEN_BLAS_TARGETS eigen_blas_static) -if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(eigen_blas_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - target_link_libraries(eigen_blas ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +if (EIGEN_BUILD_SHARED_LIBS) + add_library(eigen_blas SHARED ${EigenBlas_SRCS}) + list(APPEND EIGEN_BLAS_TARGETS eigen_blas) endif() -add_dependencies(blas eigen_blas eigen_blas_static) +foreach(target IN LISTS EIGEN_BLAS_TARGETS) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() -install(TARGETS eigen_blas eigen_blas_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) + add_dependencies(blas ${target}) + install(TARGETS ${target} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endforeach() if(EIGEN_Fortran_COMPILER_WORKS) -if(BUILD_TESTING) +if(EIGEN_BUILD_TESTING) if(EIGEN_LEAVE_TEST_IN_ALL_TARGET) add_subdirectory(testing) # can't do EXCLUDE_FROM_ALL here, breaks CTest else() @@ -52,4 +60,4 @@ if(BUILD_TESTING) endif() endif() - +endif() diff --git a/blas/level3_impl.h b/blas/level3_impl.h index 6dd6338b43236b463ebc8cf18c6904795957a8ad..66216c96451159fb959e590b631d6701a1e45743 100644 --- a/blas/level3_impl.h +++ b/blas/level3_impl.h @@ -362,18 +362,18 @@ int EIGEN_BLAS_FUNC(syrk)(const char *uplo, const char *op, const 
int *n, const typedef void (*functype)(DenseIndex, DenseIndex, const Scalar *, DenseIndex, const Scalar *, DenseIndex, Scalar *, DenseIndex, DenseIndex, const Scalar&, internal::level3_blocking&); static const functype func[8] = { // array index: NOTR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (UP << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0, // array index: NOTR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: TR | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), // array index: ADJ | (LO << 2) - (internal::general_matrix_matrix_triangular_product::run), + (internal::general_matrix_matrix_triangular_product::run), 0 }; #endif diff --git a/cmake/EigenConfigureTesting.cmake b/cmake/EigenConfigureTesting.cmake index 9cb3bb20bbcc56fa206e0714fd832d66227d7a18..add6aab53e08ff717bf46fcaa8fabb4b45baa83a 100644 --- a/cmake/EigenConfigureTesting.cmake +++ b/cmake/EigenConfigureTesting.cmake @@ -11,6 +11,15 @@ add_custom_target(buildtests) add_custom_target(check COMMAND "ctest") add_dependencies(check buildtests) +# Convenience target for only building GPU tests. +add_custom_target(buildtests_gpu) +add_custom_target(check_gpu COMMAND "ctest" "--output-on-failure" + "--no-compress-output" + "--build-no-clean" + "-T" "test" + "-L" "gpu") +add_dependencies(check_gpu buildtests_gpu) + # check whether /bin/bash exists (disabled as not used anymore) # find_file(EIGEN_BIN_BASH_EXISTS "/bin/bash" PATHS "/" NO_DEFAULT_PATH) diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index 0808446d6e11b5f45429e25719efacbea9de29d1..995354f059269619063cec58b408f0427feca66c 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -23,7 +23,9 @@ macro(ei_add_test_internal testname testname_with_suffix) set(EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}${targetname}\n") set_property(GLOBAL PROPERTY EIGEN_SUBTESTS_LIST "${EIGEN_SUBTESTS_LIST}") + set(is_gpu_test OFF) if(EIGEN_ADD_TEST_FILENAME_EXTENSION STREQUAL cu) + set(is_gpu_test ON) if(EIGEN_TEST_HIP) hip_reset_flags() hip_add_executable(${targetname} ${filename} HIPCC_OPTIONS "-DEIGEN_USE_HIP ${ARGV2}") @@ -57,10 +59,10 @@ macro(ei_add_test_internal testname testname_with_suffix) add_executable(${targetname} ${filename}) endif() - if (targetname MATCHES "^eigen2_") - add_dependencies(eigen2_buildtests ${targetname}) - else() - add_dependencies(buildtests ${targetname}) + add_dependencies(buildtests ${targetname}) + + if (is_gpu_test) + add_dependencies(buildtests_gpu ${targetname}) endif() if(EIGEN_NO_ASSERTION_CHECKING) @@ -83,7 +85,7 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) - ei_add_target_property(${targetname} COMPILE_FLAGS "${EIGEN_TEST_CUSTOM_CXX_FLAGS}") + ei_add_target_property(${targetname} COMPILE_FLAGS ${EIGEN_TEST_CUSTOM_CXX_FLAGS}) endif() if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) @@ -118,6 +120,11 @@ macro(ei_add_test_internal testname testname_with_suffix) add_dependencies("Build${current_subproject}" ${targetname}) set_property(TEST 
${testname_with_suffix} PROPERTY LABELS "${current_subproject}") endif() + if (is_gpu_test) + # Add gpu tag for testing only GPU tests. + set_property(TEST ${testname_with_suffix} APPEND PROPERTY LABELS "gpu") + endif() + if(EIGEN_SYCL) # Force include of the SYCL file at the end to avoid errors. set_property(TARGET ${targetname} PROPERTY COMPUTECPP_INCLUDE_AFTER 1) @@ -478,6 +485,7 @@ macro(ei_get_compilerver VAR) execute_process(COMMAND ${CMAKE_CXX_COMPILER} ${EIGEN_CXX_FLAG_VERSION} OUTPUT_VARIABLE eigen_cxx_compiler_version_string OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "^[ \n\r]+" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) string(REGEX REPLACE "[\n\r].*" "" eigen_cxx_compiler_version_string ${eigen_cxx_compiler_version_string}) ei_get_compilerver_from_cxx_version_string("${eigen_cxx_compiler_version_string}" CNAME CVER) @@ -487,9 +495,10 @@ macro(ei_get_compilerver VAR) endmacro() # Extract compiler name and version from a raw version string -# WARNING: if you edit thid macro, then please test it by uncommenting +# WARNING: if you edit this macro, then please test it by uncommenting # the testing macro call in ei_init_testing() of the EigenTesting.cmake file. -# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end of the file +# See also the ei_test_get_compilerver_from_cxx_version_string macro at the end +# of the file macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) # extract possible compiler names string(REGEX MATCH "g\\+\\+" ei_has_gpp ${VERSTRING}) @@ -497,6 +506,7 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) string(REGEX MATCH "gcc|GCC" ei_has_gcc ${VERSTRING}) string(REGEX MATCH "icpc|ICC" ei_has_icpc ${VERSTRING}) string(REGEX MATCH "clang|CLANG" ei_has_clang ${VERSTRING}) + string(REGEX MATCH "mingw32" ei_has_mingw ${VERSTRING}) # combine them if((ei_has_llvm) AND (ei_has_gpp OR ei_has_gcc)) @@ -505,6 +515,8 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) set(${CNAME} "llvm-clang++") elseif(ei_has_clang) set(${CNAME} "clang++") + elseif ((ei_has_mingw) AND (ei_has_gpp OR ei_has_gcc)) + set(${CNAME} "mingw32-g++") elseif(ei_has_icpc) set(${CNAME} "icpc") elseif(ei_has_gpp OR ei_has_gcc) @@ -525,11 +537,17 @@ macro(ei_get_compilerver_from_cxx_version_string VERSTRING CNAME CVER) if(NOT eicver) # try to extract 2: string(REGEX MATCH "[^0-9][0-9]+\\.[0-9]+" eicver ${VERSTRING}) - else() - set(eicver " _") + if (NOT eicver AND ei_has_mingw) + # try to extract 1 number plus suffix: + string(REGEX MATCH "[^0-9][0-9]+-win32" eicver ${VERSTRING}) + endif() endif() endif() endif() + + if (NOT eicver) + set(eicver " _") + endif() string(REGEX REPLACE ".(.*)" "\\1" ${CVER} ${eicver}) @@ -654,6 +672,7 @@ macro(ei_test_get_compilerver_from_cxx_version_string) ei_test1_get_compilerver_from_cxx_version_string("i686-apple-darwin11-llvm-g++-4.2 (GCC) 4.2.1 (Based on Apple Inc. 
build 5658) (LLVM build 2335.15.00)" "llvm-g++" "4.2.1") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 4.4.6" "g++" "4.4.6") ei_test1_get_compilerver_from_cxx_version_string("g++-mp-4.4 (GCC) 2011" "g++" "4.4") + ei_test1_get_compilerver_from_cxx_version_string("x86_64-w64-mingw32-g++ (GCC) 10-win32 20210110" "mingw32-g++" "10-win32") endmacro() # Split all tests listed in EIGEN_TESTS_LIST into num_splits many targets @@ -763,8 +782,7 @@ macro(ei_add_smoke_tests smoke_test_list) if ("${test}" IN_LIST EIGEN_SUBTESTS_LIST) add_dependencies("${buildtarget}" "${test}") # Add label smoketest to be able to run smoketests using ctest - get_property(test_labels TEST ${test} PROPERTY LABELS) - set_property(TEST ${test} PROPERTY LABELS "${test_labels};smoketest") + set_property(TEST ${test} APPEND PROPERTY LABELS "smoketest") endif() endforeach() -endmacro(ei_add_smoke_tests) \ No newline at end of file +endmacro(ei_add_smoke_tests) diff --git a/cmake/FindBLAS.cmake b/cmake/FindBLAS.cmake index 7d1f81b032ea479b697bbd918e483086d2a4ebbf..1bb8f19652c9582dca81662e1bad7fb23a7aea1b 100644 --- a/cmake/FindBLAS.cmake +++ b/cmake/FindBLAS.cmake @@ -147,6 +147,7 @@ mark_as_advanced(BLAS_VERBOSE) include(CheckFunctionExists) include(CheckFortranFunctionExists) +include(CMakeFindDependencyMacro) set(_blas_ORIG_CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES}) @@ -509,9 +510,9 @@ if (BLA_VENDOR MATCHES "Intel*" OR BLA_VENDOR STREQUAL "All") if (_LANGUAGES_ MATCHES C OR _LANGUAGES_ MATCHES CXX) if(BLAS_FIND_QUIETLY OR NOT BLAS_FIND_REQUIRED) - find_package(Threads) + find_dependency(Threads) else() - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) endif() set(BLAS_SEARCH_LIBS "") diff --git a/cmake/FindBLASEXT.cmake b/cmake/FindBLASEXT.cmake index 0fe7fb84931bacf4f7879be412c6abf37c44d175..69a941897ef23c7c77d72fa41fc969e3c320b0df 100644 --- a/cmake/FindBLASEXT.cmake +++ b/cmake/FindBLASEXT.cmake @@ -41,18 +41,19 @@ # License text for the above reference.) 
# macro to factorize this call +include(CMakeFindDependencyMacro) macro(find_package_blas) if(BLASEXT_FIND_REQUIRED) if(BLASEXT_FIND_QUIETLY) - find_package(BLAS REQUIRED QUIET) + find_dependency(BLAS REQUIRED QUIET) else() - find_package(BLAS REQUIRED) + find_dependency(BLAS REQUIRED) endif() else() if(BLASEXT_FIND_QUIETLY) - find_package(BLAS QUIET) + find_dependency(BLAS QUIET) else() - find_package(BLAS) + find_dependency(BLAS) endif() endif() endmacro() @@ -316,7 +317,7 @@ if(BLA_VENDOR MATCHES "Intel*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS BLAS_INCLUDE_DIRS) @@ -324,14 +325,14 @@ if(BLA_VENDOR MATCHES "Intel*") if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() else() if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS BLAS_INCLUDE_DIRS) @@ -343,14 +344,14 @@ elseif(BLA_VENDOR MATCHES "ACML*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) if(BLAS_PAR_LIBRARIES) if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() elseif(BLA_VENDOR MATCHES "IBMESSL*") @@ -360,21 +361,24 @@ elseif(BLA_VENDOR MATCHES "IBMESSL*") "\n (see BLAS_SEQ_LIBRARIES and BLAS_PAR_LIBRARIES)") message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) if(BLAS_PAR_LIBRARIES) if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS parallel libraries stored in BLAS_PAR_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_PAR_LIBRARIES) endif() else() if(NOT BLASEXT_FIND_QUIETLY) message(STATUS "BLAS sequential libraries stored in BLAS_SEQ_LIBRARIES") endif() - find_package_handle_standard_args(BLAS DEFAULT_MSG + find_package_handle_standard_args(BLASEXT DEFAULT_MSG BLAS_SEQ_LIBRARIES BLAS_LIBRARY_DIRS) endif() + +# Callers expect BLAS_FOUND to be set as well. 
+set(BLAS_FOUND BLASEXT_FOUND) diff --git a/cmake/FindComputeCpp.cmake b/cmake/FindComputeCpp.cmake index 3cca5150e8fb88927940ad6a77bbb81163917f46..1c271f0fecba7fcd510176040525088330c341f9 100644 --- a/cmake/FindComputeCpp.cmake +++ b/cmake/FindComputeCpp.cmake @@ -41,7 +41,8 @@ set(COMPUTECPP_BITCODE "spir64" CACHE STRING "Bitcode type to use as SYCL target in compute++") mark_as_advanced(COMPUTECPP_BITCODE) -find_package(OpenCL REQUIRED) +include(CMakeFindDependencyMacro) +find_dependency(OpenCL REQUIRED) # Find ComputeCpp package diff --git a/cmake/FindFFTW.cmake b/cmake/FindFFTW.cmake index fad476d0da01ed2a4b2cb63c0481dcdca2b7768a..ed55c5fad6baa31628a2327541a03c1701055566 100644 --- a/cmake/FindFFTW.cmake +++ b/cmake/FindFFTW.cmake @@ -22,7 +22,8 @@ if( NOT FFTW_ROOT AND ENV{FFTWDIR} ) endif() # Check if we can use PkgConfig -find_package(PkgConfig) +include(CMakeFindDependencyMacro) +find_dependency(PkgConfig) #Determine from PKG if( PKG_CONFIG_FOUND AND NOT FFTW_ROOT ) diff --git a/cmake/FindHWLOC.cmake b/cmake/FindHWLOC.cmake index 48329151892af498dc13871758069b4b1047b2fb..522f5215795ab937f1a815d5b9c1cf046138a265 100644 --- a/cmake/FindHWLOC.cmake +++ b/cmake/FindHWLOC.cmake @@ -65,8 +65,9 @@ endif() # Optionally use pkg-config to detect include/library dirs (if pkg-config is available) # ------------------------------------------------------------------------------------- -include(FindPkgConfig) -find_package(PkgConfig QUIET) +include(CMakeFindDependencyMacro) +# include(FindPkgConfig) +find_dependency(PkgConfig QUIET) if( PKG_CONFIG_EXECUTABLE AND NOT HWLOC_GIVEN_BY_USER ) pkg_search_module(HWLOC hwloc) diff --git a/cmake/FindLAPACK.cmake b/cmake/FindLAPACK.cmake index 284a4529c7a4df4d255657e242a07f443802359c..3fd738807090183de5f57ac2466582cc7126c4a2 100644 --- a/cmake/FindLAPACK.cmake +++ b/cmake/FindLAPACK.cmake @@ -26,6 +26,7 @@ include(CheckFunctionExists) +include(CMakeFindDependencyMacro) # This macro checks for the existence of the combination of fortran libraries # given by _list. If the combination is found, this macro checks (using the @@ -88,7 +89,7 @@ macro(check_lapack_libraries DEFINITIONS LIBRARIES _prefix _name _flags _list _b set(${LIBRARIES} ${_libraries_found}) # Some C++ linkers require the f2c library to link with Fortran libraries. # I do not know which ones, thus I just add the f2c library if it is available. - find_package( F2C QUIET ) + find_dependency( F2C QUIET ) if ( F2C_FOUND ) set(${DEFINITIONS} ${${DEFINITIONS}} ${F2C_DEFINITIONS}) set(${LIBRARIES} ${${LIBRARIES}} ${F2C_LIBRARIES}) @@ -135,9 +136,9 @@ endmacro() # LAPACK requires BLAS if(LAPACK_FIND_QUIETLY OR NOT LAPACK_FIND_REQUIRED) - find_package(BLAS) + find_dependency(BLAS) else() - find_package(BLAS REQUIRED) + find_dependency(BLAS REQUIRED) endif() if (NOT BLAS_FOUND) diff --git a/cmake/FindMPREAL.cmake b/cmake/FindMPREAL.cmake new file mode 100644 index 0000000000000000000000000000000000000000..947a1ce88678ddb4ffd7f49d38ac30e731afb332 --- /dev/null +++ b/cmake/FindMPREAL.cmake @@ -0,0 +1,103 @@ +# Try to find the MPFR C++ (MPREAL) library +# See http://www.holoborodko.com/pavel/mpreal/ +# +# This module supports requiring a minimum version, e.g. you can do +# find_package(MPREAL 1.8.6) +# to require version 1.8.6 or newer of MPREAL C++. 
+# +# Once done this will define +# +# MPREAL_FOUND - system has MPREAL lib with correct version +# MPREAL_INCLUDES - MPREAL required include directories +# MPREAL_LIBRARIES - MPREAL required libraries +# MPREAL_VERSION - MPREAL version + +# Copyright (c) 2020 The Eigen Authors. +# Redistribution and use is allowed according to the terms of the BSD license. + +include(CMakeFindDependencyMacro) +find_dependency(MPFR) +find_dependency(GMP) + +# Set MPREAL_INCLUDES +find_path(MPREAL_INCLUDES + NAMES + mpreal.h + PATHS + $ENV{GMPDIR} + ${INCLUDE_INSTALL_DIR} +) + +# Set MPREAL_FIND_VERSION to 1.0.0 if no minimum version is specified + +if(NOT MPREAL_FIND_VERSION) + if(NOT MPREAL_FIND_VERSION_MAJOR) + set(MPREAL_FIND_VERSION_MAJOR 1) + endif() + if(NOT MPREAL_FIND_VERSION_MINOR) + set(MPREAL_FIND_VERSION_MINOR 0) + endif() + if(NOT MPREAL_FIND_VERSION_PATCH) + set(MPREAL_FIND_VERSION_PATCH 0) + endif() + + set(MPREAL_FIND_VERSION "${MPREAL_FIND_VERSION_MAJOR}.${MPREAL_FIND_VERSION_MINOR}.${MPREAL_FIND_VERSION_PATCH}") +endif() + +# Check bugs +# - https://github.com/advanpix/mpreal/issues/7 +# - https://github.com/advanpix/mpreal/issues/9 +set(MPREAL_TEST_PROGRAM " +#include +#include +int main(int argc, char** argv) { + const mpfr::mpreal one = 1.0; + const mpfr::mpreal zero = 0.0; + using namespace std; + const mpfr::mpreal smaller = min(one, zero); + return 0; +}") + +if(MPREAL_INCLUDES) + + # Set MPREAL_VERSION + + file(READ "${MPREAL_INCLUDES}/mpreal.h" _mpreal_version_header) + + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MAJOR[ \t]+([0-9]+)" _mpreal_major_version_match "${_mpreal_version_header}") + set(MPREAL_MAJOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_MINOR[ \t]+([0-9]+)" _mpreal_minor_version_match "${_mpreal_version_header}") + set(MPREAL_MINOR_VERSION "${CMAKE_MATCH_1}") + string(REGEX MATCH "define[ \t]+MPREAL_VERSION_PATCHLEVEL[ \t]+([0-9]+)" _mpreal_patchlevel_version_match "${_mpreal_version_header}") + set(MPREAL_PATCHLEVEL_VERSION "${CMAKE_MATCH_1}") + + set(MPREAL_VERSION ${MPREAL_MAJOR_VERSION}.${MPREAL_MINOR_VERSION}.${MPREAL_PATCHLEVEL_VERSION}) + + # Check whether found version exceeds minimum version + + if(${MPREAL_VERSION} VERSION_LESS ${MPREAL_FIND_VERSION}) + set(MPREAL_VERSION_OK FALSE) + message(STATUS "MPREAL version ${MPREAL_VERSION} found in ${MPREAL_INCLUDES}, " + "but at least version ${MPREAL_FIND_VERSION} is required") + else() + set(MPREAL_VERSION_OK TRUE) + + list(APPEND MPREAL_INCLUDES "${MPFR_INCLUDES}" "${GMP_INCLUDES}") + list(REMOVE_DUPLICATES MPREAL_INCLUDES) + + list(APPEND MPREAL_LIBRARIES "${MPFR_LIBRARIES}" "${GMP_LIBRARIES}") + list(REMOVE_DUPLICATES MPREAL_LIBRARIES) + + # Make sure it compiles with the current compiler. 
+ unset(MPREAL_WORKS CACHE) + include(CheckCXXSourceCompiles) + set(CMAKE_REQUIRED_INCLUDES "${MPREAL_INCLUDES}") + set(CMAKE_REQUIRED_LIBRARIES "${MPREAL_LIBRARIES}") + check_cxx_source_compiles("${MPREAL_TEST_PROGRAM}" MPREAL_WORKS) + endif() +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MPREAL DEFAULT_MSG + MPREAL_INCLUDES MPREAL_VERSION_OK MPREAL_WORKS) +mark_as_advanced(MPREAL_INCLUDES) diff --git a/cmake/FindPastix.cmake b/cmake/FindPASTIX.cmake similarity index 96% rename from cmake/FindPastix.cmake rename to cmake/FindPASTIX.cmake index 3b47d5ce33bce84374148c4090e3fb14cd666cbd..db1427b0a4b2ee16452aa4110c2a2bae36e62ab0 100644 --- a/cmake/FindPastix.cmake +++ b/cmake/FindPASTIX.cmake @@ -118,7 +118,7 @@ if( PASTIX_FIND_COMPONENTS ) if (${component} STREQUAL "SCOTCH") set(PASTIX_LOOK_FOR_SCOTCH ON) endif() - if (${component} STREQUAL "SCOTCH") + if (${component} STREQUAL "PTSCOTCH") set(PASTIX_LOOK_FOR_PTSCOTCH ON) endif() if (${component} STREQUAL "METIS") @@ -133,14 +133,14 @@ endif() # Required dependencies # --------------------- - +include(CMakeFindDependencyMacro) if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect pthread") endif() if (PASTIX_FIND_REQUIRED) - find_package(Threads REQUIRED QUIET) + find_dependency(Threads REQUIRED QUIET) else() - find_package(Threads QUIET) + find_dependency(Threads QUIET) endif() set(PASTIX_EXTRA_LIBRARIES "") if( THREADS_FOUND ) @@ -198,9 +198,9 @@ if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect HWLOC") endif() if (PASTIX_FIND_REQUIRED) - find_package(HWLOC REQUIRED QUIET) + find_dependency(HWLOC REQUIRED QUIET) else() - find_package(HWLOC QUIET) + find_dependency(HWLOC QUIET) endif() # PASTIX depends on BLAS @@ -209,9 +209,9 @@ if (NOT PASTIX_FIND_QUIETLY) message(STATUS "Looking for PASTIX - Try to detect BLAS") endif() if (PASTIX_FIND_REQUIRED) - find_package(BLASEXT REQUIRED QUIET) + find_dependency(BLASEXT REQUIRED QUIET) else() - find_package(BLASEXT QUIET) + find_dependency(BLASEXT QUIET) endif() # Optional dependencies @@ -230,9 +230,9 @@ if (NOT MPI_FOUND AND PASTIX_LOOK_FOR_MPI) set(MPI_C_COMPILER mpicc) endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_MPI) - find_package(MPI REQUIRED QUIET) + find_dependency(MPI REQUIRED QUIET) else() - find_package(MPI QUIET) + find_dependency(MPI QUIET) endif() if (MPI_FOUND) mark_as_advanced(MPI_LIBRARY) @@ -272,10 +272,10 @@ if( NOT STARPU_FOUND AND PASTIX_LOOK_FOR_STARPU) endif() # set the list of optional dependencies we may discover if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_STARPU) - find_package(STARPU ${PASTIX_STARPU_VERSION} REQUIRED + find_dependency(STARPU ${PASTIX_STARPU_VERSION} REQUIRED COMPONENTS ${STARPU_COMPONENT_LIST}) else() - find_package(STARPU ${PASTIX_STARPU_VERSION} + find_dependency(STARPU ${PASTIX_STARPU_VERSION} COMPONENTS ${STARPU_COMPONENT_LIST}) endif() @@ -288,9 +288,9 @@ if (NOT SCOTCH_FOUND AND PASTIX_LOOK_FOR_SCOTCH) message(STATUS "Looking for PASTIX - Try to detect SCOTCH") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_SCOTCH) - find_package(SCOTCH REQUIRED QUIET) + find_dependency(SCOTCH REQUIRED QUIET) else() - find_package(SCOTCH QUIET) + find_dependency(SCOTCH QUIET) endif() endif() @@ -301,9 +301,9 @@ if (NOT PTSCOTCH_FOUND AND PASTIX_LOOK_FOR_PTSCOTCH) message(STATUS "Looking for PASTIX - Try to detect PTSCOTCH") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_PTSCOTCH) - find_package(PTSCOTCH REQUIRED QUIET) + 
find_dependency(PTSCOTCH REQUIRED QUIET) else() - find_package(PTSCOTCH QUIET) + find_dependency(PTSCOTCH QUIET) endif() endif() @@ -314,9 +314,9 @@ if (NOT METIS_FOUND AND PASTIX_LOOK_FOR_METIS) message(STATUS "Looking for PASTIX - Try to detect METIS") endif() if (PASTIX_FIND_REQUIRED AND PASTIX_FIND_REQUIRED_METIS) - find_package(METIS REQUIRED QUIET) + find_dependency(METIS REQUIRED QUIET) else() - find_package(METIS QUIET) + find_dependency(METIS QUIET) endif() endif() diff --git a/cmake/FindPTSCOTCH.cmake b/cmake/FindPTSCOTCH.cmake index 51eecf1af866e16c50eb7866d3e1ff469dcc940d..6ccc743e68cc5711fce90bf74ccb611cebd4759d 100644 --- a/cmake/FindPTSCOTCH.cmake +++ b/cmake/FindPTSCOTCH.cmake @@ -79,20 +79,21 @@ if( PTSCOTCH_FIND_COMPONENTS ) endif() # PTSCOTCH depends on Threads, try to find it +include(CMakeFindDependencyMacro) if (NOT THREADS_FOUND) if (PTSCOTCH_FIND_REQUIRED) - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) else() - find_package(Threads) + find_dependency(Threads) endif() endif() # PTSCOTCH depends on MPI, try to find it if (NOT MPI_FOUND) if (PTSCOTCH_FIND_REQUIRED) - find_package(MPI REQUIRED) + find_dependency(MPI REQUIRED) else() - find_package(MPI) + find_dependency(MPI) endif() endif() @@ -148,18 +149,18 @@ else() foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") find_path(PTSCOTCH_${ptscotch_hdr}_DIRS - NAMES ${ptscotch_hdr} - HINTS ${PTSCOTCH_DIR} - PATH_SUFFIXES "include" "include/scotch") + NAMES ${ptscotch_hdr} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES "include" "include/scotch") mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) endforeach() else() foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) set(PTSCOTCH_${ptscotch_hdr}_DIRS "PTSCOTCH_${ptscotch_hdr}_DIRS-NOTFOUND") find_path(PTSCOTCH_${ptscotch_hdr}_DIRS - NAMES ${ptscotch_hdr} - HINTS ${_inc_env} - PATH_SUFFIXES "scotch") + NAMES ${ptscotch_hdr} + HINTS ${_inc_env} + PATH_SUFFIXES "scotch") mark_as_advanced(PTSCOTCH_${ptscotch_hdr}_DIRS) endforeach() endif() @@ -171,7 +172,6 @@ foreach(ptscotch_hdr ${PTSCOTCH_hdrs_to_find}) if (PTSCOTCH_${ptscotch_hdr}_DIRS) list(APPEND PTSCOTCH_INCLUDE_DIRS "${PTSCOTCH_${ptscotch_hdr}_DIRS}") else () - set(PTSCOTCH_INCLUDE_DIRS "PTSCOTCH_INCLUDE_DIRS-NOTFOUND") if (NOT PTSCOTCH_FIND_QUIETLY) message(STATUS "Looking for ptscotch -- ${ptscotch_hdr} not found") endif() @@ -229,16 +229,16 @@ else() foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY - NAMES ${ptscotch_lib} - HINTS ${PTSCOTCH_DIR} - PATH_SUFFIXES lib lib32 lib64) + NAMES ${ptscotch_lib} + HINTS ${PTSCOTCH_DIR} + PATH_SUFFIXES lib lib32 lib64) endforeach() else() foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) set(PTSCOTCH_${ptscotch_lib}_LIBRARY "PTSCOTCH_${ptscotch_lib}_LIBRARY-NOTFOUND") find_library(PTSCOTCH_${ptscotch_lib}_LIBRARY - NAMES ${ptscotch_lib} - HINTS ${_lib_env}) + NAMES ${ptscotch_lib} + HINTS ${_lib_env}) endforeach() endif() endif() @@ -255,7 +255,6 @@ foreach(ptscotch_lib ${PTSCOTCH_libs_to_find}) list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") list(APPEND PTSCOTCH_LIBRARY_DIRS "${${ptscotch_lib}_lib_path}") else () - list(APPEND PTSCOTCH_LIBRARIES "${PTSCOTCH_${ptscotch_lib}_LIBRARY}") if (NOT PTSCOTCH_FIND_QUIETLY) message(STATUS "Looking for ptscotch -- lib ${ptscotch_lib} not found") endif() diff --git a/cmake/FindScotch.cmake b/cmake/FindSCOTCH.cmake 
similarity index 99% rename from cmake/FindScotch.cmake rename to cmake/FindSCOTCH.cmake index af00eb0f2565868e5470739cf662ce664347a2aa..11b971a926443aa415fb28759f6aa2f04f703e41 100644 --- a/cmake/FindScotch.cmake +++ b/cmake/FindSCOTCH.cmake @@ -71,11 +71,12 @@ if( SCOTCH_FIND_COMPONENTS ) endif() # SCOTCH may depend on Threads, try to find it +include(CMakeFindDependencyMacro) if (NOT THREADS_FOUND) if (SCOTCH_FIND_REQUIRED) - find_package(Threads REQUIRED) + find_dependency(Threads REQUIRED) else() - find_package(Threads) + find_dependency(Threads) endif() endif() diff --git a/cmake/FindTriSYCL.cmake b/cmake/FindTriSYCL.cmake index 41bc2fa894b29b55de903af46ddf640333a9793b..81042390729788ca8a644e52ba901af74d355120 100644 --- a/cmake/FindTriSYCL.cmake +++ b/cmake/FindTriSYCL.cmake @@ -57,18 +57,19 @@ mark_as_advanced(TRISYCL_DEBUG_STRUCTORS) mark_as_advanced(TRISYCL_TRACE_KERNEL) #triSYCL definitions -set(CL_SYCL_LANGUAGE_VERSION 220 CACHE VERSION +set(CL_SYCL_LANGUAGE_VERSION 220 CACHE STRING "Host language version to be used by trisYCL (default is: 220)") -set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE VERSION +set(TRISYCL_CL_LANGUAGE_VERSION 220 CACHE STRING "Device language version to be used by trisYCL (default is: 220)") -#set(TRISYCL_COMPILE_OPTIONS "-std=c++1z -Wall -Wextra") -set(CMAKE_CXX_STANDARD 14) +# triSYCL now requires c++17 +set(CMAKE_CXX_STANDARD 17) set(CXX_STANDARD_REQUIRED ON) # Find OpenCL package +include(CMakeFindDependencyMacro) if(TRISYCL_OPENCL) - find_package(OpenCL REQUIRED) + find_dependency(OpenCL REQUIRED) if(UNIX) set(BOOST_COMPUTE_INCPATH /usr/include/compute CACHE PATH "Path to Boost.Compute headers (default is: /usr/include/compute)") @@ -77,11 +78,11 @@ endif() # Find OpenMP package if(TRISYCL_OPENMP) - find_package(OpenMP REQUIRED) + find_dependency(OpenMP REQUIRED) endif() # Find Boost -find_package(Boost 1.58 REQUIRED COMPONENTS chrono log) +find_dependency(Boost 1.58 REQUIRED COMPONENTS chrono log) # If debug or trace we need boost log if(TRISYCL_DEBUG OR TRISYCL_DEBUG_STRUCTORS OR TRISYCL_TRACE_KERNEL) @@ -90,9 +91,23 @@ else() set(LOG_NEEDED OFF) endif() -find_package(Threads REQUIRED) +find_dependency(Threads REQUIRED) # Find triSYCL directory +if (TRISYCL_INCLUDES AND TRISYCL_LIBRARIES) + set(TRISYCL_FIND_QUIETLY TRUE) +endif () + +find_path(TRISYCL_INCLUDE_DIR + NAMES sycl.hpp + PATHS $ENV{TRISYCLDIR} $ENV{TRISYCLDIR}/include ${INCLUDE_INSTALL_DIR} + PATH_SUFFIXES triSYCL +) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(TriSYCL DEFAULT_MSG + TRISYCL_INCLUDE_DIR) + if(NOT TRISYCL_INCLUDE_DIR) message(FATAL_ERROR "triSYCL include directory - Not found! (please set TRISYCL_INCLUDE_DIR") @@ -100,36 +115,42 @@ else() message(STATUS "triSYCL include directory - Found ${TRISYCL_INCLUDE_DIR}") endif() +include(CMakeParseArguments) ####################### # add_sycl_to_target ####################### -# -# Sets the proper flags and includes for the target compilation. -# -# targetName : Name of the target to add a SYCL to. -# sourceFile : Source file to be compiled for SYCL. -# binaryDir : Intermediate directory to output the integration header. 
-# -function(add_sycl_to_target targetName sourceFile binaryDir) +function(add_sycl_to_target) + set(options) + set(one_value_args + TARGET + ) + set(multi_value_args + SOURCES + ) + cmake_parse_arguments(ADD_SYCL_ARGS + "${options}" + "${one_value_args}" + "${multi_value_args}" + ${ARGN} + ) # Add include directories to the "#include <>" paths - target_include_directories (${targetName} PUBLIC + target_include_directories (${ADD_SYCL_ARGS_TARGET} PUBLIC ${TRISYCL_INCLUDE_DIR} ${Boost_INCLUDE_DIRS} $<$:${OpenCL_INCLUDE_DIRS}> $<$:${BOOST_COMPUTE_INCPATH}>) - # Link dependencies - target_link_libraries(${targetName} PUBLIC + target_link_libraries(${ADD_SYCL_ARGS_TARGET} $<$:${OpenCL_LIBRARIES}> Threads::Threads $<$:Boost::log> Boost::chrono) - # Compile definitions - target_compile_definitions(${targetName} PUBLIC + target_compile_definitions(${ADD_SYCL_ARGS_TARGET} PUBLIC + EIGEN_SYCL_TRISYCL $<$:TRISYCL_NO_ASYNC> $<$:TRISYCL_OPENCL> $<$:TRISYCL_DEBUG> @@ -138,13 +159,13 @@ function(add_sycl_to_target targetName sourceFile binaryDir) $<$:BOOST_LOG_DYN_LINK>) # C++ and OpenMP requirements - target_compile_options(${targetName} PUBLIC + target_compile_options(${ADD_SYCL_ARGS_TARGET} PUBLIC ${TRISYCL_COMPILE_OPTIONS} $<$:${OpenMP_CXX_FLAGS}>) if(${TRISYCL_OPENMP} AND (NOT WIN32)) # Does not support generator expressions - set_target_properties(${targetName} + set_target_properties(${ADD_SYCL_ARGS_TARGET} PROPERTIES LINK_FLAGS ${OpenMP_CXX_FLAGS}) endif() diff --git a/doc/CustomizingEigen_Plugins.dox b/doc/CustomizingEigen_Plugins.dox index d88f2409b955f8b03884459366276bc92b42f388..9ab0200ff946dc1be9e0d6465ddff920a83a2314 100644 --- a/doc/CustomizingEigen_Plugins.dox +++ b/doc/CustomizingEigen_Plugins.dox @@ -59,7 +59,7 @@ operator+(const Scalar& scalar, const MatrixBase& mat) { return CwiseBinaryOp, const ConstantReturnType, Derived>(Constant(rows(),cols(),scalar), mat.derived()); } \endcode -Then one can the following declaration in the config.h or whatever prerequisites header file of his project: +Then one can add the following declaration in the config.h or whatever prerequisites header file of his project: \code #define EIGEN_MATRIXBASE_PLUGIN "MatrixBaseAddons.h" \endcode diff --git a/doc/FunctionsTakingEigenTypes.dox b/doc/FunctionsTakingEigenTypes.dox index 6b4e49214c295cb67eb3cf3a01767661684a0765..3e745462cd42747c6f1a47898d7d32b52f1c9e56 100644 --- a/doc/FunctionsTakingEigenTypes.dox +++ b/doc/FunctionsTakingEigenTypes.dox @@ -126,7 +126,7 @@ and contrary to what one might think at first, this implementation is fine unles MatrixXf x,y,z; MatrixXf C = cov(x,y+z); \endcode -In this special case, the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression x+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C. +In this special case, the example is fine and will be working because both parameters are declared as \e const references. The compiler creates a temporary and evaluates the expression y+z into this temporary. Once the function is processed, the temporary is released and the result is assigned to C. \b Note: Functions taking \e const references to Matrix (or Array) can process expressions at the cost of temporaries. 
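The FunctionsTakingEigenTypes.dox fix above clarifies that the hidden temporary holds y+z, not x+z. A self-contained sketch of that call pattern; this cov() is a simplified stand-in for the doc's example, not the actual implementation:

\code
// Simplified stand-in for the doc's cov() example: because the parameters
// are const MatrixXf&, the expression y + z is evaluated into an unnamed
// temporary MatrixXf before cov() runs, exactly as the corrected text says.
#include <Eigen/Dense>
using Eigen::MatrixXf;

MatrixXf cov(const MatrixXf& x, const MatrixXf& y) {
  MatrixXf x_c = x.rowwise() - x.colwise().mean();  // center columns
  MatrixXf y_c = y.rowwise() - y.colwise().mean();
  return (x_c.adjoint() * y_c) / float(x.rows() - 1);
}

int main() {
  MatrixXf x = MatrixXf::Random(100, 3), y = MatrixXf::Random(100, 3),
           z = MatrixXf::Random(100, 3);
  MatrixXf C = cov(x, y + z);  // y + z lives in a hidden temporary
  return 0;
}
\endcode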
diff --git a/doc/LeastSquares.dox b/doc/LeastSquares.dox index 24dfe4b4f47b7f0cef94401820f0598f5016d119..ddbf38dec9b27f7fd5bac0d74efc5af9dfe77ca8 100644 --- a/doc/LeastSquares.dox +++ b/doc/LeastSquares.dox @@ -30,14 +30,17 @@ computing least squares solutions: This is an example from the page \link TutorialLinearAlgebra Linear algebra and decompositions \endlink. +If you just need to solve the least squares problem, but are not interested in the SVD per se, a +faster alternative method is CompleteOrthogonalDecomposition. \section LeastSquaresQR Using the QR decomposition The solve() method in QR decomposition classes also computes the least squares solution. There are -three QR decomposition classes: HouseholderQR (no pivoting, so fast but unstable), -ColPivHouseholderQR (column pivoting, thus a bit slower but more accurate) and FullPivHouseholderQR -(full pivoting, so slowest and most stable). Here is an example with column pivoting: +three QR decomposition classes: HouseholderQR (no pivoting, fast but unstable if your matrix is +not full rank), ColPivHouseholderQR (column pivoting, thus a bit slower but more stable) and +FullPivHouseholderQR (full pivoting, so slowest and slightly more stable than ColPivHouseholderQR). +Here is an example with column pivoting: @@ -61,9 +64,11 @@ Finding the least squares solution of \a Ax = \a b is equivalent to solving the
Example: | Output:
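As a sketch of the normal-equations approach this section describes (the 4x2 random system is only an assumed example, in the spirit of the snippet referenced above):
\code
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(4, 2);
  Eigen::VectorXf b = Eigen::VectorXf::Random(4);
  // Solve the normal equation A^T A x = A^T b via a Cholesky (LDLT) factorization.
  Eigen::VectorXf x = (A.transpose() * A).ldlt().solve(A.transpose() * b);
  std::cout << "The least-squares solution is:\n" << x << std::endl;
}
\endcode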
-If the matrix \a A is ill-conditioned, then this is not a good method, because the condition number +This method is usually the fastest, especially when \a A is "tall and skinny". However, if the +matrix \a A is even mildly ill-conditioned, this is not a good method, because the condition number of A^T A is the square of the condition number of \a A. This means that you -lose twice as many digits using normal equation than if you use the other methods. +lose roughly twice as many digits of accuracy using the normal equation, compared to the more stable +methods mentioned above. */ diff --git a/doc/SparseLinearSystems.dox b/doc/SparseLinearSystems.dox index 38754e4afbee9248f94a7cc28cafc9660337f52f..66d3bcd3c6f41c7e50d9febae17f925e014960a9 100644 --- a/doc/SparseLinearSystems.dox +++ b/doc/SparseLinearSystems.dox @@ -13,24 +13,20 @@ They are summarized in the following tables:
Class | Solver kind | Matrix kind | Features related to performance | License | Notes
SimplicialLLT \n \#include<Eigen/SparseCholesky> | Direct LLt factorization | SPD | Fill-in reducing | LGPL | SimplicialLDLT is often preferable
SimplicialLDLT \n \#include<Eigen/SparseCholesky> | Direct LDLt factorization | SPD | Fill-in reducing | LGPL | Recommended for very sparse and not too large problems (e.g., 2D Poisson eq.)
SparseLU \n \#include<Eigen/SparseLU> | LU factorization | Square | Fill-in reducing, leverages fast dense algebra | MPL2 | Optimized for small and large problems with irregular patterns
SparseQR \n \#include<Eigen/SparseQR> | QR factorization | Any, rectangular | Fill-in reducing | MPL2 | Recommended for least-squares problems, has a basic rank-revealing feature
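For reference, a minimal sketch of driving one of the direct solvers listed above; the small SPD tridiagonal (1D Poisson) matrix is an assumed example, not part of this patch:
\code
#include <Eigen/Sparse>
#include <Eigen/Dense>
#include <iostream>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 5;
  SpMat A(n, n); // SPD tridiagonal (1D Poisson) matrix
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0) A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);
  Eigen::SimplicialLDLT<SpMat> solver(A);         // factorize
  if (solver.info() != Eigen::Success) return 1;  // factorization failed
  Eigen::VectorXd x = solver.solve(b);
  std::cout << x << std::endl;
}
\endcode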
@@ -38,21 +34,18 @@ They are summarized in the following tables:
Class | Solver kind | Matrix kind | Supported preconditioners, [default] | License | Notes
ConjugateGradient \n \#include<Eigen/IterativeLinearSolvers> | Classic iterative CG | SPD | IdentityPreconditioner, [DiagonalPreconditioner], IncompleteCholesky | MPL2 | Recommended for large symmetric problems (e.g., 3D Poisson eq.)
LeastSquaresConjugateGradient \n \#include<Eigen/IterativeLinearSolvers> | CG for rectangular least-squares problems | Rectangular | IdentityPreconditioner, [LeastSquareDiagonalPreconditioner] | MPL2 | Solve for min |A'Ax-b|^2 without forming A'A
BiCGSTAB \n \#include<Eigen/IterativeLinearSolvers> | Iterative stabilized bi-conjugate gradient | Square | IdentityPreconditioner, [DiagonalPreconditioner], IncompleteLUT | MPL2 | To speed up the convergence, try it with the \ref IncompleteLUT preconditioner.
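And a matching sketch for the iterative solvers above, again on an assumed small Poisson matrix; the default DiagonalPreconditioner from the table is used implicitly:
\code
#include <Eigen/Sparse>
#include <iostream>

int main() {
  typedef Eigen::SparseMatrix<double> SpMat;
  const int n = 5;
  SpMat A(n, n);
  for (int i = 0; i < n; ++i) {
    A.insert(i, i) = 2.0;
    if (i > 0) A.insert(i, i - 1) = -1.0;
    if (i + 1 < n) A.insert(i, i + 1) = -1.0;
  }
  A.makeCompressed();
  Eigen::VectorXd b = Eigen::VectorXd::Ones(n);
  // Use both triangular halves of the symmetric matrix.
  Eigen::ConjugateGradient<SpMat, Eigen::Lower | Eigen::Upper> cg(A);
  Eigen::VectorXd x = cg.solve(b);
  std::cout << "#iterations: " << cg.iterations()
            << ", estimated error: " << cg.error() << std::endl;
}
\endcode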
diff --git a/doc/SparseQuickReference.dox b/doc/SparseQuickReference.dox index 9779f3f9c88bd52e39f45f37a54f582de2ebe2a7..14a5891312853dbcc85e64f7d32cca4f98dc4fdb 100644 --- a/doc/SparseQuickReference.dox +++ b/doc/SparseQuickReference.dox @@ -153,7 +153,7 @@ It is easy to perform arithmetic operations on sparse matrices provided that the \code perm.indices(); // Reference to the vector of indices sm1.twistedBy(perm); // Permute rows and columns -sm2 = sm1 * perm; // Permute the columns +sm2 = sm1 * perm; // Permute the rows sm2 = perm * sm1; // Permute the columns \endcode diff --git a/doc/TopicLinearAlgebraDecompositions.dox b/doc/TopicLinearAlgebraDecompositions.dox index 0965da87248933a49f6474aa09c6b03553d21fe1..402b3769e2f3e44e6c145e9d0bdeefe82d9a6462 100644 --- a/doc/TopicLinearAlgebraDecompositions.dox +++ b/doc/TopicLinearAlgebraDecompositions.dox @@ -72,7 +72,7 @@ To get an overview of the true relative speed of the different decompositions, c Orthogonalization Yes Excellent - Soon: blocking + - @@ -88,6 +88,18 @@ To get an overview of the true relative speed of the different decompositions, c + CompleteOrthogonalDecomposition + - + Fast + Good + Yes + Orthogonalization + Yes + Excellent + - + + + LLT Positive definite Very fast @@ -99,7 +111,7 @@ To get an overview of the true relative speed of the different decompositions, c Blocking - + LDLT Positive or negative semidefinite1 Very fast diff --git a/doc/TutorialBlockOperations.dox b/doc/TutorialBlockOperations.dox index a2d8c97cc094f62b0bd64278658272792362295a..df277482c897e23675520e21e0fb07b50ff0cce5 100644 --- a/doc/TutorialBlockOperations.dox +++ b/doc/TutorialBlockOperations.dox @@ -167,6 +167,20 @@ matrix.rightCols(q);\endcode \code matrix.rightCols();\endcode +%Block containing the q columns starting from i + \link DenseBase::middleCols() * \endlink + \code +matrix.middleCols(i,q);\endcode + \code +matrix.middleCols(i);\endcode + +%Block containing the q rows starting from i + \link DenseBase::middleRows() * \endlink + \code +matrix.middleRows(i,q);\endcode + \code +matrix.middleRows(i);\endcode + Here is a simple example illustrating the use of the operations presented above: diff --git a/doc/TutorialLinearAlgebra.dox b/doc/TutorialLinearAlgebra.dox index a72724143c693fd918268be84e722e093c2c65bf..8042fcad333788acc479a97d1b39809d3bcefd0a 100644 --- a/doc/TutorialLinearAlgebra.dox +++ b/doc/TutorialLinearAlgebra.dox @@ -14,7 +14,7 @@ QR, %SVD, eigendecompositions... After reading this page, don't miss our \f[ Ax \: = \: b \f] Where \a A and \a b are matrices (\a b could be a vector, as a special case). You want to find a solution \a x. -\b The \b solution: You can choose between various decompositions, depending on what your matrix \a A looks like, +\b The \b solution: You can choose between various decompositions, depending on the properties of your matrix \a A, and depending on whether you favor speed or accuracy. However, let's start with an example that works in all cases, and is a good compromise: @@ -34,7 +34,7 @@ Vector3f x = dec.solve(b); Here, ColPivHouseholderQR is a QR decomposition with column pivoting. It's a good compromise for this tutorial, as it works for all matrices while being quite fast. Here is a table of some other decompositions that you can choose from, -depending on your matrix and the trade-off you want to make: +depending on your matrix, the problem you are trying to solve, and the trade-off you want to make:
@@ -128,11 +128,13 @@ depending on your matrix and the trade-off you want to make:
To get an overview of the true relative speed of the different decompositions, check this \link DenseDecompositionBenchmark benchmark \endlink. -All of these decompositions offer a solve() method that works as in the above example. +All of these decompositions offer a solve() method that works as in the above example. -For example, if your matrix is positive definite, the above table says that a very good -choice is then the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general -matrix (not a vector) as right hand side is possible. +If you know more about the properties of your matrix, you can use the above table to select the best method. +For example, a good choice for solving linear systems with a non-symmetric matrix of full rank is PartialPivLU. +If you know that your matrix is also symmetric and positive definite, the above table says that +a very good choice is the LLT or LDLT decomposition. Here's an example, also demonstrating that using a general +matrix (not a vector) as right hand side is possible: @@ -146,7 +148,34 @@ For a \ref TopicLinearAlgebraDecompositions "much more complete table" comparing supports many other decompositions), see our special page on \ref TopicLinearAlgebraDecompositions "this topic". -\section TutorialLinAlgSolutionExists Checking if a solution really exists + +\section TutorialLinAlgLeastsquares Least squares solving + +The most general and accurate method to solve under- or over-determined linear systems +in the least squares sense is the SVD decomposition. Eigen provides two implementations. +The recommended one is the BDCSVD class, which scales well for large problems +and automatically falls back to the JacobiSVD class for smaller problems. +For both classes, their solve() method solves the linear system in the least-squares +sense. + +Here is an example: +
Example: | Output:
\include TutorialLinAlgSVDSolve.cpp | \verbinclude TutorialLinAlgSVDSolve.out
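The included snippet is along these lines; a minimal sketch of least-squares solving with BDCSVD (the sizes here are an assumption):
\code
#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXf A = Eigen::MatrixXf::Random(4, 2);
  Eigen::VectorXf b = Eigen::VectorXf::Random(4);
  // solve() returns the least-squares solution of Ax = b.
  Eigen::VectorXf x = A.bdcSvd(Eigen::ComputeThinU | Eigen::ComputeThinV).solve(b);
  std::cout << "The least-squares solution is:\n" << x << std::endl;
}
\endcode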
+ +An alternative to the SVD, which is usually faster and about as accurate, is CompleteOrthogonalDecomposition. + +Again, if you know more about the problem, the table above contains methods that are potentially faster. +If your matrix is full rank, HouseholderQR is the method of choice. If your matrix is full rank and well conditioned, +using the Cholesky decomposition (LLT) on the matrix of the normal equations can be faster still. +Our page on \link LeastSquares least squares solving \endlink has more details. + + +\section TutorialLinAlgSolutionExists Checking if a matrix is singular Only you know what error margin you want to allow for a solution to be considered valid. So Eigen lets you do this computation for yourself, if you want to, as in this example: @@ -179,11 +208,11 @@ very rare. The call to info() is to check for this possibility. \section TutorialLinAlgInverse Computing inverse and determinant First of all, make sure that you really want this. While inverse and determinant are fundamental mathematical concepts, -in \em numerical linear algebra they are not as popular as in pure mathematics. Inverse computations are often +in \em numerical linear algebra they are not as useful as in pure mathematics. Inverse computations are often advantageously replaced by solve() operations, and the determinant is often \em not a good way of checking if a matrix is invertible. -However, for \em very \em small matrices, the above is not true, and inverse and determinant can be very useful. +However, for \em very \em small matrices, the above may not be true, and inverse and determinant can be very useful. While certain decompositions, such as PartialPivLU and FullPivLU, offer inverse() and determinant() methods, you can also call inverse() and determinant() directly on a matrix. If your matrix is of a very small fixed size (at most 4x4) this @@ -198,28 +227,6 @@ Here is an example: -\section TutorialLinAlgLeastsquares Least squares solving - -The most accurate method to do least squares solving is with a SVD decomposition. -Eigen provides two implementations. -The recommended one is the BDCSVD class, which scale well for large problems -and automatically fall-back to the JacobiSVD class for smaller problems. -For both classes, their solve() method is doing least-squares solving. - -Here is an example: - - - - - - -
Example: | Output:
\include TutorialLinAlgSVDSolve.cpp | \verbinclude TutorialLinAlgSVDSolve.out
+ -Another methods, potentially faster but less reliable, are to use a Cholesky decomposition of the -normal matrix or a QR decomposition. Our page on \link LeastSquares least squares solving \endlink -has more details. - - \section TutorialLinAlgSeparateComputation Separating the computation from the construction In the above examples, the decomposition was computed at the same time that the decomposition object was constructed. diff --git a/doc/TutorialMatrixClass.dox b/doc/TutorialMatrixClass.dox index 2c452220f433757f0921668064c2f49b0da167ae..9b6b4b1f0ad2a1901005096da3fa44522cc1ac7e 100644 --- a/doc/TutorialMatrixClass.dox +++ b/doc/TutorialMatrixClass.dox @@ -111,9 +111,9 @@ Vector4d c(5.0, 6.0, 7.0, 8.0); If C++11 is enabled, fixed-size column or row vectors of arbitrary size can be initialized by passing an arbitrary number of coefficients: \code -Vector2i a(1, 2); // A column vector containing the elements {1, 2} -Matrix<int, 5, 1> b {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} -Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A column vector containing the elements {1, 2, 3, 4, 5} +Vector2i a(1, 2); // A column-vector containing the elements {1, 2} +Matrix<int, 5, 1> b {1, 2, 3, 4, 5}; // A column-vector containing the elements {1, 2, 3, 4, 5} +Matrix<int, 1, 5> c = {1, 2, 3, 4, 5}; // A row-vector containing the elements {1, 2, 3, 4, 5} \endcode In the general case of matrices and vectors with either fixed or runtime sizes, diff --git a/doc/TutorialSlicingIndexing.dox b/doc/TutorialSlicingIndexing.dox index 98ace43e4a5a2cf9e793b0711d3196f1f621638c..60f1edca35269c90065367be384aa7ebb03bc0ad 100644 --- a/doc/TutorialSlicingIndexing.dox +++ b/doc/TutorialSlicingIndexing.dox @@ -14,8 +14,8 @@ In particular, it supports \b slicing that consists in taking a set of rows, col All the aforementioned operations are handled through the generic DenseBase::operator()(const RowIndices&, const ColIndices&) method. Each argument can be: - An integer indexing a single row or column, including symbolic indices. - - The symbol Eigen::all representing the whole set of respective rows or columns in increasing order. - - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::lastN functions. + - The symbol Eigen::placeholders::all representing the whole set of respective rows or columns in increasing order. + - An ArithmeticSequence as constructed by the Eigen::seq, Eigen::seqN, or Eigen::placeholders::lastN functions. - Any 1D vector/array of integers including %Eigen's vector/array, expressions, std::vector, std::array, as well as plain C arrays: `int[N]`. More generally, it can accept any object exposing the following two member functions: @@ -72,7 +72,7 @@ Here are some examples for a 2D array/matrix \c A and a 1D array/vector \c v.
%Block starting at \c i,j having \c m rows, and \c n columns - \code A(seqN(i,m), seqN(i,n) \endcode + \code A(seqN(i,m), seqN(j,n)) \endcode \code A.block(i,j,m,n) \endcode @@ -129,12 +129,12 @@ Here comes \link Eigen::lastN(SizeType) Eigen::lastN(size) \endlink, and \link E Bottom-right corner of A of size \c m times \c n - \code v(lastN(m), lastN(n)) \endcode + \code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode Bottom-right corner of A of size \c m times \c n - \code v(lastN(m), lastN(n)) \endcode + \code A(lastN(m), lastN(n)) \endcode \code A.bottomRightCorner(m,n) \endcode diff --git a/doc/TutorialSparse.dox b/doc/TutorialSparse.dox index 350ea11396dab3a36eae853b916893bdcdaf52f0..4faba418d095a5edeb0c33383fc1d350b8b2d5c5 100644 --- a/doc/TutorialSparse.dox +++ b/doc/TutorialSparse.dox @@ -44,8 +44,8 @@ This storage scheme is better explained on an example. The following matrix and one of its possible sparse, \b column \b major representation:
-Values: 22 7 _ 3 5 14 _ _ 1 _ 17 8
-InnerIndices: 1 2 _ 0 2 4 _ _ 2 _ 1 4
+Values: 22 7 _ 3 5 _ 14 _ 1 _ 17 8
+InnerIndices: 1 2 _ 0 2 _ 4 _ 2 _ 1 4
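A small illustrative sketch, not part of the patch, of inspecting these storage arrays on a compressed column-major SparseMatrix:
\code
#include <Eigen/Sparse>
#include <iostream>

int main() {
  Eigen::SparseMatrix<double> m(5, 5); // column-major by default
  m.insert(0, 1) = 22.0;
  m.insert(2, 0) = 7.0;
  m.insert(4, 2) = 5.0;
  m.makeCompressed(); // switch to the compressed (CCS) representation
  // Values and inner (row) indices, one entry per stored coefficient:
  for (Eigen::Index k = 0; k < m.nonZeros(); ++k)
    std::cout << m.valuePtr()[k] << " @ row " << m.innerIndexPtr()[k] << "\n";
  // Outer starts: one offset per column, plus the terminating offset.
  for (Eigen::Index j = 0; j <= m.outerSize(); ++j)
    std::cout << m.outerIndexPtr()[j] << ' ';
}
\endcode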
@@ -60,7 +60,7 @@ On the other hand, inserting elements with increasing inner indices in a given i The case where no empty space is available is a special case, and is referred as the \em compressed mode. It corresponds to the widely used Compressed Column (or Row) Storage schemes (CCS or CRS). Any SparseMatrix can be turned to this form by calling the SparseMatrix::makeCompressed() function. -In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. +In this case, one can remark that the \c InnerNNZs array is redundant with \c OuterStarts because we have the equality: \c InnerNNZs[j] = \c OuterStarts[j+1]-\c OuterStarts[j]. Therefore, in practice a call to SparseMatrix::makeCompressed() frees this buffer. It is worth noting that most of our wrappers to external libraries requires compressed matrices as inputs. diff --git a/doc/examples/matrixfree_cg.cpp b/doc/examples/matrixfree_cg.cpp index 74699381c7e834159317c19002165d3ff7f357b2..cc0eead13d4b87cf1bf562679e37e4905cff58bf 100644 --- a/doc/examples/matrixfree_cg.cpp +++ b/doc/examples/matrixfree_cg.cpp @@ -9,7 +9,7 @@ using Eigen::SparseMatrix; namespace Eigen { namespace internal { - // MatrixReplacement looks-like a SparseMatrix, so let's inherits its traits: + // MatrixReplacement looks-like a SparseMatrix, so let's inherit its traits: template<> struct traits : public Eigen::internal::traits > {}; @@ -66,7 +66,7 @@ namespace internal { { // This method should implement "dst += alpha * lhs * rhs" inplace, // however, for iterative solvers, alpha is always equal to 1, so let's not bother about it. - assert(alpha==Scalar(1) && "scaling is not implemented"); + eigen_assert(alpha==Scalar(1) && "scaling is not implemented"); EIGEN_ONLY_USED_FOR_DEBUG(alpha); // Here we could simply call dst.noalias() += lhs.my_matrix() * rhs, diff --git a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp index bb1c2ccf14b48b8fce1c584abf71281c1107104d..adeed9af64e43535ae708f7b1c7fb3dbd8655008 100644 --- a/doc/snippets/ComplexEigenSolver_eigenvectors.cpp +++ b/doc/snippets/ComplexEigenSolver_eigenvectors.cpp @@ -1,4 +1,4 @@ MatrixXcf ones = MatrixXcf::Ones(3,3); ComplexEigenSolver ces(ones); cout << "The first eigenvector of the 3x3 matrix of ones is:" - << endl << ces.eigenvectors().col(1) << endl; + << endl << ces.eigenvectors().col(0) << endl; diff --git a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp index cfc8b0d54b73ea7984211b7e7238ddc575a4e840..94b0d6ebd3c08f1414ae1ae7b5adf87d0da7d0d4 100644 --- a/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp +++ b/doc/snippets/SelfAdjointEigenSolver_eigenvectors.cpp @@ -1,4 +1,4 @@ MatrixXd ones = MatrixXd::Ones(3,3); SelfAdjointEigenSolver es(ones); cout << "The first eigenvector of the 3x3 matrix of ones is:" - << endl << es.eigenvectors().col(1) << endl; + << endl << es.eigenvectors().col(0) << endl; diff --git a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp index 93dcfca1d6de1bf0e06419c7575a94d7c329fbfa..9a66baa76d1552435a970c512b500b68ed75a1e4 100644 --- a/doc/snippets/Tridiagonalization_decomposeInPlace.cpp +++ b/doc/snippets/Tridiagonalization_decomposeInPlace.cpp @@ -4,7 +4,9 @@ cout << "Here is a random symmetric 5x5 matrix:" << endl << A << endl << endl; VectorXd diag(5); VectorXd subdiag(4); 
-internal::tridiagonalization_inplace(A, diag, subdiag, true); +VectorXd hcoeffs(4); // Scratch space for householder reflector. +VectorXd workspace(5); +internal::tridiagonalization_inplace(A, diag, subdiag, hcoeffs, workspace, true); cout << "The orthogonal matrix Q is:" << endl << A << endl; cout << "The diagonal of the tridiagonal matrix T is:" << endl << diag << endl; cout << "The subdiagonal of the tridiagonal matrix T is:" << endl << subdiag << endl; diff --git a/doc/snippets/compile_snippet.cpp.in b/doc/snippets/compile_snippet.cpp.in index c11457a3f576df2555fa5d45fbae5a08c29962a5..04f276d0bd0c639475faabe8efe6d87381f1ce16 100644 --- a/doc/snippets/compile_snippet.cpp.in +++ b/doc/snippets/compile_snippet.cpp.in @@ -2,6 +2,7 @@ static bool eigen_did_assert = false; #define eigen_assert(X) if(!eigen_did_assert && !(X)){ std::cout << "### Assertion raised in " << __FILE__ << ":" << __LINE__ << ":\n" #X << "\n### The following would happen without assertions:\n"; eigen_did_assert = true;} #include +#include #include #ifndef M_PI diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 9eec810761e3b85605648c26725f290d8e75539c..8d6d75401690da7e1baacad04ed69bef541ffa44 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -1,10 +1,18 @@ - project(EigenLapack CXX) +if(EIGEN_BUILD_LAPACK AND EIGEN_BUILD_BLAS) + include(CheckLanguage) check_language(Fortran) if(CMAKE_Fortran_COMPILER) enable_language(Fortran) + if("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") + if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_GREATER_EQUAL 10.0) + # We use an old version of LAPACK with argument type mismatches. + # Allow them to compile anyway with newer GNU versions. + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fallow-argument-mismatch") + endif() + endif() set(EIGEN_Fortran_COMPILER_WORKS ON) else() set(EIGEN_Fortran_COMPILER_WORKS OFF) @@ -88,25 +96,29 @@ endif() endif() -add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS}) -add_library(eigen_lapack SHARED ${EigenLapack_SRCS}) +set(EIGEN_LAPACK_TARGETS "") -target_link_libraries(eigen_lapack eigen_blas) +add_library(eigen_lapack_static ${EigenLapack_SRCS} ${ReferenceLapack_SRCS}) +list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack_static) -if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) - target_link_libraries(eigen_lapack_static ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) - target_link_libraries(eigen_lapack ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) +if (EIGEN_BUILD_SHARED_LIBS) + add_library(eigen_lapack SHARED ${EigenLapack_SRCS}) + list(APPEND EIGEN_LAPACK_TARGETS eigen_lapack) + target_link_libraries(eigen_lapack eigen_blas) endif() -add_dependencies(lapack eigen_lapack eigen_lapack_static) +foreach(target IN LISTS EIGEN_LAPACK_TARGETS) + if(EIGEN_STANDARD_LIBRARIES_TO_LINK_TO) + target_link_libraries(${target} ${EIGEN_STANDARD_LIBRARIES_TO_LINK_TO}) + endif() + add_dependencies(lapack ${target}) + install(TARGETS ${target} + RUNTIME DESTINATION bin + LIBRARY DESTINATION lib + ARCHIVE DESTINATION lib) +endforeach() -install(TARGETS eigen_lapack eigen_lapack_static - RUNTIME DESTINATION bin - LIBRARY DESTINATION lib - ARCHIVE DESTINATION lib) - - get_filename_component(eigen_full_path_to_testing_lapack "./testing/" ABSOLUTE) if(EXISTS ${eigen_full_path_to_testing_lapack}) @@ -141,6 +153,7 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) string(REPLACE "." 
"_" input_name ${input}) set(testName "${target}_${input_name}") if(EXISTS "${TEST_INPUT}") + add_dependencies(buildtests ${target}) add_test(NAME LAPACK-${testName} COMMAND "${CMAKE_COMMAND}" -DTEST=$ @@ -446,3 +459,6 @@ if(EXISTS ${eigen_full_path_to_testing_lapack}) endif() +elseif(EIGEN_BUILD_LAPACK AND NOT EIGEN_BUILD_BLAS) + message(FATAL_ERROR "EIGEN_BUILD_LAPACK requires EIGEN_BUILD_BLAS") +endif() #EIGEN_BUILD_LAPACK diff --git a/test/AnnoyingScalar.h b/test/AnnoyingScalar.h index 0f8e70d360eba9ec6b1031b9fd689af909ca08e9..b621887275df49477656d23da0e31432bf65010f 100644 --- a/test/AnnoyingScalar.h +++ b/test/AnnoyingScalar.h @@ -126,7 +126,7 @@ template<> struct NumTraits : NumTraits { enum { - RequireInitialization = true + RequireInitialization = 1 }; typedef AnnoyingScalar Real; typedef AnnoyingScalar Nested; @@ -145,10 +145,6 @@ bool (isfinite)(const AnnoyingScalar& x) { } namespace internal { - template<> EIGEN_STRONG_INLINE AnnoyingScalar pcmp_eq(const AnnoyingScalar& a, const AnnoyingScalar& b) - { return AnnoyingScalar(pcmp_eq(*a.v, *b.v)); } - template<> EIGEN_STRONG_INLINE AnnoyingScalar pselect(const AnnoyingScalar& mask, const AnnoyingScalar& a, const AnnoyingScalar& b) - { return numext::equal_strict(*mask.v, 0.f) ? b : a; } template<> EIGEN_STRONG_INLINE double cast(const AnnoyingScalar& x) { return double(*x.v); } template<> EIGEN_STRONG_INLINE float cast(const AnnoyingScalar& x) { return *x.v; } } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 56664e783ecde7cb24f408f36ade2d9be0b1820c..dbd4bc618c23afa77a958c7e2562cce436319bc1 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -42,45 +42,53 @@ endif() set(SPARSE_LIBS " ") find_package(CHOLMOD) -if(CHOLMOD_FOUND) +if(CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) add_definitions("-DEIGEN_CHOLMOD_SUPPORT") include_directories(${CHOLMOD_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) set(CHOLMOD_ALL_LIBS ${CHOLMOD_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "CHOLMOD, ") + + ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "CHOLMOD, ") endif() find_package(UMFPACK) -if(UMFPACK_FOUND) +if(UMFPACK_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_UMFPACK_SUPPORT") include_directories(${UMFPACK_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(UMFPACK_ALL_LIBS ${UMFPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "UMFPACK, ") + + ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "UMFPACK, ") endif() find_package(KLU) -if(KLU_FOUND) +if(KLU_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_KLU_SUPPORT") include_directories(${KLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(KLU_ALL_LIBS ${KLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "KLU, ") + + ei_add_test(klu_support "" "${KLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "KLU, ") endif() find_package(SuperLU 4.0) -if(SuperLU_FOUND) +if(SuperLU_FOUND AND EIGEN_BUILD_BLAS) add_definitions("-DEIGEN_SUPERLU_SUPPORT") include_directories(${SUPERLU_INCLUDES}) set(SPARSE_LIBS ${SPARSE_LIBS} ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) set(SUPERLU_ALL_LIBS ${SUPERLU_LIBRARIES} ${EIGEN_BLAS_LIBRARIES}) ei_add_property(EIGEN_TESTED_BACKENDS "SuperLU, ") + + 
ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") else() ei_add_property(EIGEN_MISSING_BACKENDS "SuperLU, ") endif() @@ -124,7 +132,7 @@ else() endif() find_package(SPQR) -if(SPQR_FOUND AND CHOLMOD_FOUND AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK AND (EIGEN_Fortran_COMPILER_WORKS OR LAPACK_FOUND) ) add_definitions("-DEIGEN_SPQR_SUPPORT") include_directories(${SPQR_INCLUDES}) set(SPQR_ALL_LIBS ${SPQR_LIBRARIES} ${CHOLMOD_LIBRARIES} ${EIGEN_LAPACK_LIBRARIES} ${EIGEN_BLAS_LIBRARIES} ${LAPACK_LIBRARIES}) @@ -152,6 +160,7 @@ endif() set_property(GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT "Official") add_custom_target(BuildOfficial) +ei_add_test(clz) ei_add_test(rand) ei_add_test(meta) ei_add_test(numext) @@ -164,7 +173,6 @@ ei_add_test(nullary) ei_add_test(mixingtypes) ei_add_test(io) ei_add_test(packetmath "-DEIGEN_FAST_MATH=1") -ei_add_test(unalignedassert) ei_add_test(vectorization_logic) ei_add_test(basicstuff) ei_add_test(constructor) @@ -311,22 +319,6 @@ if(QT4_FOUND) ei_add_test(qtvector "" "${QT_QTCORE_LIBRARY}") endif() -if(UMFPACK_FOUND) - ei_add_test(umfpack_support "" "${UMFPACK_ALL_LIBS}") -endif() - -if(KLU_FOUND OR SuiteSparse_FOUND) - ei_add_test(klu_support "" "${KLU_ALL_LIBS}") -endif() - -if(SUPERLU_FOUND) - ei_add_test(superlu_support "" "${SUPERLU_ALL_LIBS}") -endif() - -if(CHOLMOD_FOUND) - ei_add_test(cholmod_support "" "${CHOLMOD_ALL_LIBS}") -endif() - if(PARDISO_FOUND) ei_add_test(pardiso_support "" "${PARDISO_ALL_LIBS}") endif() @@ -335,7 +327,7 @@ if(PASTIX_FOUND AND (SCOTCH_FOUND OR METIS_FOUND)) ei_add_test(pastix_support "" "${PASTIX_ALL_LIBS}") endif() -if(SPQR_FOUND AND CHOLMOD_FOUND) +if(SPQR_FOUND AND CHOLMOD_FOUND AND EIGEN_BUILD_BLAS AND EIGEN_BUILD_LAPACK) ei_add_test(spqr_support "" "${SPQR_ALL_LIBS}") endif() @@ -384,40 +376,38 @@ if(EIGEN_TEST_CUDA_CLANG AND NOT CMAKE_CXX_COMPILER MATCHES "clang") message(WARNING "EIGEN_TEST_CUDA_CLANG is set, but CMAKE_CXX_COMPILER does not appear to be clang.") endif() -if(EIGEN_TEST_CUDA) +find_package(CUDA 9.0) +if(CUDA_FOUND AND EIGEN_TEST_CUDA) + # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor + # and -fno-check-new flags since they trigger thousands of compilation warnings + # in the CUDA runtime + string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") -find_package(CUDA 5.0) -if(CUDA_FOUND) - - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") - endif() - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") else() - foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}") + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + foreach(ARCH IN 
LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - + ei_add_test(gpu_basic) - - unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) -endif() + unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) endif() @@ -429,8 +419,8 @@ if (EIGEN_TEST_HIP) set(HIP_PATH "/opt/rocm/hip" CACHE STRING "Path to the HIP installation.") if (EXISTS ${HIP_PATH}) - - list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) + + list(APPEND CMAKE_MODULE_PATH ${HIP_PATH}/cmake) find_package(HIP REQUIRED) if (HIP_FOUND) @@ -444,12 +434,12 @@ if (EIGEN_TEST_HIP) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(gpu_basic) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) - + elseif ((${HIP_PLATFORM} STREQUAL "nvcc") OR (${HIP_PLATFORM} STREQUAL "nvidia")) message(FATAL_ERROR "HIP_PLATFORM = nvcc is not supported within Eigen") else () message(FATAL_ERROR "Unknown HIP_PLATFORM = ${HIP_PLATFORM}") - endif() + endif() endif() else () message(FATAL_ERROR "EIGEN_TEST_HIP is ON, but the specified HIP_PATH (${HIP_PATH}) does not exist") diff --git a/test/SafeScalar.h b/test/SafeScalar.h new file mode 100644 index 0000000000000000000000000000000000000000..c5cb75717c7c0131814ee98ceb840ac83b48dd26 --- /dev/null +++ b/test/SafeScalar.h @@ -0,0 +1,30 @@ + +// A Scalar that asserts for uninitialized access. +template +class SafeScalar { + public: + SafeScalar() : initialized_(false) {} + SafeScalar(const SafeScalar& other) { + *this = other; + } + SafeScalar& operator=(const SafeScalar& other) { + val_ = T(other); + initialized_ = true; + return *this; + } + + SafeScalar(T val) : val_(val), initialized_(true) {} + SafeScalar& operator=(T val) { + val_ = val; + initialized_ = true; + } + + operator T() const { + VERIFY(initialized_ && "Uninitialized access."); + return val_; + } + + private: + T val_; + bool initialized_; +}; diff --git a/test/array_cwise.cpp b/test/array_cwise.cpp index 1bc8e19f91a915c7a274015a4814a40455546a75..2388830902ddb5df4a6952b07e1a8f84d4918ffd 100644 --- a/test/array_cwise.cpp +++ b/test/array_cwise.cpp @@ -72,9 +72,9 @@ void pow_test() { for (int j = 0; j < num_cases; ++j) { Scalar e = static_cast(std::pow(x(i,j), y(i,j))); Scalar a = actual(i, j); - bool fail = !(a==e) && !internal::isApprox(a, e, tol) && !((numext::isnan)(a) && (numext::isnan)(e)); - all_pass &= !fail; - if (fail) { + bool success = (a==e) || ((numext::isfinite)(e) && internal::isApprox(a, e, tol)) || ((numext::isnan)(a) && (numext::isnan)(e)); + all_pass &= success; + if (!success) { std::cout << "pow(" << x(i,j) << "," << y(i,j) << ") = " << a << " != " << e << std::endl; } } @@ -626,6 +626,41 @@ template void min_max(const ArrayType& m) } } +template +struct shift_left { + template + Scalar operator()(const Scalar& v) const { + return v << N; + } +}; + +template +struct arithmetic_shift_right { + template + Scalar operator()(const Scalar& v) const { + return v >> N; + } +}; + +template void array_integer(const ArrayType& m) +{ + Index rows = m.rows(); + Index cols = m.cols(); + + ArrayType m1 = ArrayType::Random(rows, cols), + m2(rows, cols); + + m2 = m1.template shiftLeft<2>(); + VERIFY( (m2 == m1.unaryExpr(shift_left<2>())).all() ); 
+ m2 = m1.template shiftLeft<9>(); + VERIFY( (m2 == m1.unaryExpr(shift_left<9>())).all() ); + + m2 = m1.template shiftRight<2>(); + VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<2>())).all() ); + m2 = m1.template shiftRight<9>(); + VERIFY( (m2 == m1.unaryExpr(arithmetic_shift_right<9>())).all() ); +} + EIGEN_DECLARE_TEST(array_cwise) { for(int i = 0; i < g_repeat; i++) { @@ -636,6 +671,8 @@ EIGEN_DECLARE_TEST(array_cwise) CALL_SUBTEST_5( array(ArrayXXf(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); CALL_SUBTEST_6( array(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( array_integer(ArrayXXi(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); + CALL_SUBTEST_6( array_integer(Array(internal::random(1,EIGEN_TEST_MAX_SIZE), internal::random(1,EIGEN_TEST_MAX_SIZE))) ); } for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( comparisons(Array()) ); diff --git a/test/array_for_matrix.cpp b/test/array_for_matrix.cpp index fb6be351e69af963db61c903ea0e9f6e3c6d52da..06e04a2fa04fbd8d94c3427d04f770438538ca55 100644 --- a/test/array_for_matrix.cpp +++ b/test/array_for_matrix.cpp @@ -211,6 +211,40 @@ template void cwise_min_max(const MatrixType& m) VERIFY_IS_APPROX(MatrixType::Constant(rows,cols, maxM1).array(), (m1.array().max)( maxM1)); VERIFY_IS_APPROX(m1.array(), (m1.array().max)( minM1)); + // Test NaN propagation for min/max. + if (!NumTraits::IsInteger) { + m1(0,0) = NumTraits::quiet_NaN(); + // Elementwise. + VERIFY((numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(MatrixType::Constant(rows,cols, Scalar(1)))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMax(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.template cwiseMin(Scalar(1))(0,0))); + + + VERIFY((numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(MatrixType::Constant(rows,cols, Scalar(1)).array())(0,0))); + VERIFY((numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY((numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template max(Scalar(1))(0,0))); + VERIFY(!(numext::isnan)(m1.array().template min(Scalar(1))(0,0))); + + // Reductions. 
+ VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + if (m1.size() > 1) { + VERIFY(!(numext::isnan)(m1.template maxCoeff())); + VERIFY(!(numext::isnan)(m1.template minCoeff())); + } else { + VERIFY((numext::isnan)(m1.template maxCoeff())); + VERIFY((numext::isnan)(m1.template minCoeff())); + } + } } template void resize(const MatrixTraits& t) diff --git a/test/bdcsvd.cpp b/test/bdcsvd.cpp index e92a7dc97d185d7b2e67992b09435f01f1a08d31..41303775c74cfa9c0eaa8793a6e57d52d0e7040e 100644 --- a/test/bdcsvd.cpp +++ b/test/bdcsvd.cpp @@ -54,20 +54,46 @@ void bdcsvd_method() VERIFY_IS_APPROX(m.bdcSvd(ComputeFullU|ComputeFullV).adjoint().solve(m), m); } -// compare the Singular values returned with Jacobi and Bdc +// Compare the Singular values returned with Jacobi and Bdc. template -void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0) +void compare_bdc_jacobi(const MatrixType& a = MatrixType(), unsigned int computationOptions = 0, int algoswap = 16, bool random = true) { - MatrixType m = MatrixType::Random(a.rows(), a.cols()); - BDCSVD bdc_svd(m); + MatrixType m = random ? MatrixType::Random(a.rows(), a.cols()) : a; + + BDCSVD bdc_svd(m.rows(), m.cols(), computationOptions); + bdc_svd.setSwitchSize(algoswap); + bdc_svd.compute(m); + JacobiSVD jacobi_svd(m); VERIFY_IS_APPROX(bdc_svd.singularValues(), jacobi_svd.singularValues()); + if(computationOptions & ComputeFullU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); if(computationOptions & ComputeThinU) VERIFY_IS_APPROX(bdc_svd.matrixU(), jacobi_svd.matrixU()); if(computationOptions & ComputeFullV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); if(computationOptions & ComputeThinV) VERIFY_IS_APPROX(bdc_svd.matrixV(), jacobi_svd.matrixV()); } +// Verifies total deflation is **not** triggered. +void compare_bdc_jacobi_instance(bool structure_as_m, int algoswap = 16) +{ + MatrixXd m(4, 3); + if (structure_as_m) { + // The first 3 rows are the reduced form of Matrix 1 as shown below, and it + // has nonzero elements in the first column and diagonals only. + m << 1.056293, 0, 0, + -0.336468, 0.907359, 0, + -1.566245, 0, 0.149150, + -0.1, 0, 0; + } else { + // Matrix 1. + m << 0.882336, 18.3914, -26.7921, + -5.58135, 17.1931, -24.0892, + -20.794, 8.68496, -4.83103, + -8.4981, -10.5451, 23.9072; + } + compare_bdc_jacobi(m, 0, algoswap, false); +} + EIGEN_DECLARE_TEST(bdcsvd) { CALL_SUBTEST_3(( svd_verify_assert >(Matrix3f()) )); @@ -114,5 +140,13 @@ EIGEN_DECLARE_TEST(bdcsvd) // CALL_SUBTEST_9( svd_preallocate() ); CALL_SUBTEST_2( svd_underoverflow() ); + + // Without total deflation issues. + CALL_SUBTEST_11(( compare_bdc_jacobi_instance(true) )); + CALL_SUBTEST_12(( compare_bdc_jacobi_instance(false) )); + + // With total deflation issues before, when it shouldn't be triggered. 
+ CALL_SUBTEST_13(( compare_bdc_jacobi_instance(true, 3) )); + CALL_SUBTEST_14(( compare_bdc_jacobi_instance(false, 3) )); } diff --git a/test/bfloat16_float.cpp b/test/bfloat16_float.cpp index 1df22f73e5cad2ee749920c69ebe3004828373c7..c3de0b19a0ce6aaa8b65140dd2cebd88e33a1de1 100644 --- a/test/bfloat16_float.cpp +++ b/test/bfloat16_float.cpp @@ -32,18 +32,6 @@ float BinaryToFloat(uint32_t sign, uint32_t exponent, uint32_t high_mantissa, return dest; } -void test_truncate(float input, float expected_truncation, float expected_rounding){ - bfloat16 truncated = Eigen::bfloat16_impl::truncate_to_bfloat16(input); - bfloat16 rounded = Eigen::bfloat16_impl::float_to_bfloat16_rtne(input); - if ((numext::isnan)(input)){ - VERIFY((numext::isnan)(static_cast(truncated)) || (numext::isinf)(static_cast(truncated))); - VERIFY((numext::isnan)(static_cast(rounded)) || (numext::isinf)(static_cast(rounded))); - return; - } - VERIFY_IS_EQUAL(expected_truncation, static_cast(truncated)); - VERIFY_IS_EQUAL(expected_rounding, static_cast(rounded)); -} - template void test_roundtrip() { // Representable T round trip via bfloat16 @@ -122,31 +110,6 @@ void test_conversion() VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(0.0f), 0x0000); VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(-0.0f), 0x8000); - // Flush denormals to zero - for (float denorm = -std::numeric_limits::denorm_min(); - denorm < std::numeric_limits::denorm_min(); - denorm = nextafterf(denorm, 1.0f)) { - bfloat16 bf_trunc = Eigen::bfloat16_impl::truncate_to_bfloat16(denorm); - VERIFY_IS_EQUAL(static_cast(bf_trunc), 0.0f); - - // Implicit conversion of denormls to bool is correct - VERIFY_IS_EQUAL(static_cast(bfloat16(denorm)), false); - VERIFY_IS_EQUAL(bfloat16(denorm), false); - - if (std::signbit(denorm)) { - VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x8000); - } else { - VERIFY_BFLOAT16_BITS_EQUAL(bf_trunc, 0x0000); - } - bfloat16 bf_round = Eigen::bfloat16_impl::float_to_bfloat16_rtne(denorm); - VERIFY_IS_EQUAL(static_cast(bf_round), 0.0f); - if (std::signbit(denorm)) { - VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x8000); - } else { - VERIFY_BFLOAT16_BITS_EQUAL(bf_round, 0x0000); - } - } - // Default is zero VERIFY_IS_EQUAL(static_cast(bfloat16()), 0.0f); @@ -156,52 +119,6 @@ void test_conversion() test_roundtrip >(); test_roundtrip >(); - // Truncate test - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0xf5c3), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(1, 0x80, 0x48, 0xf5c3), - BinaryToFloat(1, 0x80, 0x48, 0x0000), - BinaryToFloat(1, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x8000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0xff, 0x00, 0x0001), - BinaryToFloat(0, 0xff, 0x40, 0x0000), - BinaryToFloat(0, 0xff, 0x40, 0x0000)); - test_truncate( - BinaryToFloat(0, 0xff, 0x7f, 0xffff), - BinaryToFloat(0, 0xff, 0x40, 0x0000), - BinaryToFloat(0, 0xff, 0x40, 0x0000)); - test_truncate( - BinaryToFloat(1, 0x80, 0x48, 0xc000), - BinaryToFloat(1, 0x80, 0x48, 0x0000), - BinaryToFloat(1, 0x80, 0x49, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x4000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x80, 0x48, 0x8000), - BinaryToFloat(0, 0x80, 0x48, 0x0000), - BinaryToFloat(0, 0x80, 0x48, 0x0000)); - 
test_truncate( - BinaryToFloat(0, 0x00, 0x48, 0x8000), - BinaryToFloat(0, 0x00, 0x00, 0x0000), - BinaryToFloat(0, 0x00, 0x00, 0x0000)); - test_truncate( - BinaryToFloat(0, 0x00, 0x7f, 0xc000), - BinaryToFloat(0, 0x00, 0x00, 0x0000), - BinaryToFloat(0, 0x00, 0x00, 0x0000)); - // Conversion Array a; for (int i = 0; i < 100; i++) a(i) = i + 1.25; @@ -250,12 +167,6 @@ void test_conversion() VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x0, 0xff, 0x40, 0x0)), 0x7fc0); VERIFY_BFLOAT16_BITS_EQUAL(bfloat16(BinaryToFloat(0x1, 0xff, 0x40, 0x0)), 0xffc0); - VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16( - BinaryToFloat(0x0, 0xff, 0x40, 0x0)), - 0x7fc0); - VERIFY_BFLOAT16_BITS_EQUAL(Eigen::bfloat16_impl::truncate_to_bfloat16( - BinaryToFloat(0x1, 0xff, 0x40, 0x0)), - 0xffc0); } void test_numtraits() diff --git a/test/block.cpp b/test/block.cpp index 84124aba642b38521d864a5b130ac3cab8b86684..667a3be3914308978bcb94fa49971e65fbbd326e 100644 --- a/test/block.cpp +++ b/test/block.cpp @@ -143,11 +143,12 @@ template void block(const MatrixType& m) // check that linear acccessors works on blocks m1 = m1_copy; - if((MatrixType::Flags&RowMajorBit)==0) - VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1+c1*rows), m1(r1,c1)); - else - VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1+r1*cols), m1(r1,c1)); - + if (c1 > 0 && r1 > 0) { + if ((MatrixType::Flags & RowMajorBit) == 0) + VERIFY_IS_EQUAL(m1.leftCols(c1).coeff(r1 + c1 * rows), m1(r1, c1)); + else + VERIFY_IS_EQUAL(m1.topRows(r1).coeff(c1 + r1 * cols), m1(r1, c1)); + } // now test some block-inside-of-block. diff --git a/test/boostmultiprec.cpp b/test/boostmultiprec.cpp index 7c79ded23bc49639ded9cda751c10bdae0fe0ee0..e83e9704479c9a354677f60f0547d464e9b4ea4c 100644 --- a/test/boostmultiprec.cpp +++ b/test/boostmultiprec.cpp @@ -74,8 +74,7 @@ #include #include -namespace mp = boost::multiprecision; -typedef mp::number, mp::et_on> Real; +typedef boost::multiprecision::number, boost::multiprecision::et_on> Real; namespace Eigen { template<> struct NumTraits : GenericNumTraits { diff --git a/test/clz.cpp b/test/clz.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1d08b47155c8771731154e4687fdf40435bc7fff --- /dev/null +++ b/test/clz.cpp @@ -0,0 +1,74 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2023 The Eigen Authors +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#include "main.h" + +template +int ref_clz(T val) { + static const int kNumBits = sizeof(T) * CHAR_BIT; + T kMsbMask = T(1) << (kNumBits - 1); + int z = 0; + for (; z < kNumBits && ((val & kMsbMask) == 0); ++z) { + val <<= 1; + } + return z; +} + +template +int ref_ctz(T val) { + static const int kNumBits = sizeof(T) * CHAR_BIT; + T kLsbMask = T(1); + int z = 0; + for (; z < kNumBits && ((val & kLsbMask) == 0); ++z) { + val >>= 1; + } + return z; +} + +template +void test_clz_ctz() { + T step = sizeof(T) <= 2 ? 
1 : (Eigen::NumTraits::highest() / (T(1) << 16)); + T iters = Eigen::NumTraits::highest() / step; + for (T i = 0; i < iters; ++i) { + T val = i * step; + int expected_clz = ref_clz(val); + int actual_clz = Eigen::internal::clz(val); + VERIFY(expected_clz == actual_clz); + + int expected_ctz = ref_ctz(val); + int actual_ctz = Eigen::internal::ctz(val); + VERIFY(expected_ctz == actual_ctz); + } +} + +template +void test_clz_ctz_random() { + for (int i = 0; i < 1024 * 1024; ++i) { + T val = Eigen::internal::random(); + int expected_clz = ref_clz(val); + int actual_clz = Eigen::internal::clz(val); + VERIFY(expected_clz == actual_clz); + + int expected_ctz = ref_ctz(val); + int actual_ctz = Eigen::internal::ctz(val); + VERIFY(expected_ctz == actual_ctz); + } +} + +EIGEN_DECLARE_TEST(clz) { + CALL_SUBTEST_1(test_clz_ctz()); + CALL_SUBTEST_2(test_clz_ctz()); + CALL_SUBTEST_3(test_clz_ctz()); + CALL_SUBTEST_4(test_clz_ctz()); + + for (int i = 0; i < g_repeat; i++) { + test_clz_ctz_random(); + test_clz_ctz_random(); + } +} diff --git a/test/conservative_resize.cpp b/test/conservative_resize.cpp index 5dc500068c232cfe5401ed7f5a757308cbbb5b05..d48eb126fdb1fca779981e258f0c82430c1a01d1 100644 --- a/test/conservative_resize.cpp +++ b/test/conservative_resize.cpp @@ -115,9 +115,11 @@ template void noncopyable() { typedef Eigen::Matrix VectorType; typedef Eigen::Matrix MatrixType; - + { +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif int n = 50; VectorType v0(n), v1(n); MatrixType m0(n,n), m1(n,n), m2(n,n); @@ -148,6 +150,7 @@ EIGEN_DECLARE_TEST(conservative_resize) CALL_SUBTEST_4((run_matrix_tests, Eigen::ColMajor>())); CALL_SUBTEST_5((run_matrix_tests, Eigen::RowMajor>())); CALL_SUBTEST_5((run_matrix_tests, Eigen::ColMajor>())); + CALL_SUBTEST_1((run_matrix_tests())); CALL_SUBTEST_1((run_vector_tests())); CALL_SUBTEST_2((run_vector_tests())); @@ -155,7 +158,9 @@ EIGEN_DECLARE_TEST(conservative_resize) CALL_SUBTEST_4((run_vector_tests >())); CALL_SUBTEST_5((run_vector_tests >())); +#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW AnnoyingScalar::dont_throw = true; +#endif CALL_SUBTEST_6(( run_vector_tests() )); CALL_SUBTEST_6(( noncopyable<0>() )); } diff --git a/test/dense_storage.cpp b/test/dense_storage.cpp index 7fa25859dded034adcabba0ed0e151cfc6fbb2cb..45c2bd728a993ee4747901536aea2876cbf18126 100644 --- a/test/dense_storage.cpp +++ b/test/dense_storage.cpp @@ -8,17 +8,27 @@ // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. #include "main.h" +#include "AnnoyingScalar.h" +#include "SafeScalar.h" #include -template -void dense_storage_copy() +#if EIGEN_HAS_TYPE_TRAITS && EIGEN_HAS_CXX11 +using DenseStorageD3x3 = Eigen::DenseStorage; +static_assert(std::is_trivially_move_constructible::value, "DenseStorage not trivially_move_constructible"); +static_assert(std::is_trivially_move_assignable::value, "DenseStorage not trivially_move_assignable"); +#if !defined(EIGEN_DENSE_STORAGE_CTOR_PLUGIN) +static_assert(std::is_trivially_copy_constructible::value, "DenseStorage not trivially_copy_constructible"); +static_assert(std::is_trivially_copy_assignable::value, "DenseStorage not trivially_copy_assignable"); +static_assert(std::is_trivially_copyable::value, "DenseStorage not trivially_copyable"); +#endif +#endif + +template +void dense_storage_copy(int rows, int cols) { - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? 
Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; + typedef DenseStorage DenseStorageType; - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; const int size = rows*cols; DenseStorageType reference(size, rows, cols); T* raw_reference = reference.data(); @@ -31,14 +41,11 @@ void dense_storage_copy() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } -template -void dense_storage_assignment() +template +void dense_storage_assignment(int rows, int cols) { - static const int Size = ((Rows==Dynamic || Cols==Dynamic) ? Dynamic : Rows*Cols); - typedef DenseStorage DenseStorageType; + typedef DenseStorage DenseStorageType; - const int rows = (Rows==Dynamic) ? 4 : Rows; - const int cols = (Cols==Dynamic) ? 3 : Cols; const int size = rows*cols; DenseStorageType reference(size, rows, cols); T* raw_reference = reference.data(); @@ -52,6 +59,34 @@ void dense_storage_assignment() VERIFY_IS_EQUAL(raw_reference[i], raw_copied_reference[i]); } +template +void dense_storage_swap(int rows0, int cols0, int rows1, int cols1) +{ + typedef DenseStorage DenseStorageType; + + const int size0 = rows0*cols0; + DenseStorageType a(size0, rows0, cols0); + for (int i=0; i(i); + } + + const int size1 = rows1*cols1; + DenseStorageType b(size1, rows1, cols1); + for (int i=0; i(-i); + } + + a.swap(b); + + for (int i=0; i(i)); + } + + for (int i=0; i(-i)); + } +} + template void dense_storage_alignment() { @@ -78,30 +113,78 @@ void dense_storage_alignment() #endif } -EIGEN_DECLARE_TEST(dense_storage) -{ - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); - dense_storage_copy(); +template +void dense_storage_tests() { + // Dynamic Storage. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + // Fixed Storage. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); + dense_storage_copy(4, 3); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); - dense_storage_assignment(); + // Dynamic Storage. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + // Fixed Storage. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + dense_storage_assignment(4, 3); + + // Dynamic Storage. + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + // Fixed Storage. 
+ dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + // Fixed Storage with Uninitialized Elements. + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 1); + dense_storage_swap(2, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 4, 1); + dense_storage_swap(4, 1, 4, 3); + dense_storage_swap(4, 3, 4, 3); + dense_storage_swap(4, 3, 2, 3); + dense_storage_swap(2, 3, 4, 3); + + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); + dense_storage_alignment(); +} - dense_storage_alignment(); - dense_storage_alignment(); - dense_storage_alignment(); - dense_storage_alignment(); +EIGEN_DECLARE_TEST(dense_storage) +{ + dense_storage_tests(); + dense_storage_tests(); + dense_storage_tests >(); + dense_storage_tests(); } diff --git a/test/eigensolver_generalized_real.cpp b/test/eigensolver_generalized_real.cpp index 95ed431db70a040e63f1183a9ccfce8cb41ed76c..a0c99b18a37a2c66bad959c99adede749d88f6dc 100644 --- a/test/eigensolver_generalized_real.cpp +++ b/test/eigensolver_generalized_real.cpp @@ -85,6 +85,42 @@ template void generalized_eigensolver_real(const MatrixType } } +template +void generalized_eigensolver_assert() { + GeneralizedEigenSolver eig; + // all raise assert if uninitialized + VERIFY_RAISES_ASSERT(eig.info()); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); + + // none raise assert after compute called + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == Success); + eig.eigenvectors(); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // eigenvectors() raises assert, if eigenvectors were not requested + eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20), false); + VERIFY(eig.info() == Success); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + eig.eigenvalues(); + eig.alphas(); + eig.betas(); + + // all except info raise assert if realQZ did not converge + eig.setMaxIterations(0); // force real QZ to fail. 
+ eig.compute(MatrixType::Random(20, 20), MatrixType::Random(20, 20)); + VERIFY(eig.info() == NoConvergence); + VERIFY_RAISES_ASSERT(eig.eigenvectors()); + VERIFY_RAISES_ASSERT(eig.eigenvalues()); + VERIFY_RAISES_ASSERT(eig.alphas()); + VERIFY_RAISES_ASSERT(eig.betas()); +} + EIGEN_DECLARE_TEST(eigensolver_generalized_real) { for(int i = 0; i < g_repeat; i++) { @@ -98,6 +134,7 @@ EIGEN_DECLARE_TEST(eigensolver_generalized_real) CALL_SUBTEST_2( generalized_eigensolver_real(MatrixXd(2,2)) ); CALL_SUBTEST_3( generalized_eigensolver_real(Matrix()) ); CALL_SUBTEST_4( generalized_eigensolver_real(Matrix2d()) ); + CALL_SUBTEST_5( generalized_eigensolver_assert() ); TEST_SET_BUT_UNUSED_VARIABLE(s) } } diff --git a/test/eigensolver_selfadjoint.cpp b/test/eigensolver_selfadjoint.cpp index 65b80c3fb1c71494808bd938037a5b79e2494a57..0fb2f4da766209ab394459bdc969e97bba1d6bb5 100644 --- a/test/eigensolver_selfadjoint.cpp +++ b/test/eigensolver_selfadjoint.cpp @@ -234,15 +234,21 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) { int s = 0; for(int i = 0; i < g_repeat; i++) { + // trivial test for 1x1 matrices: CALL_SUBTEST_1( selfadjointeigensolver(Matrix())); CALL_SUBTEST_1( selfadjointeigensolver(Matrix())); + CALL_SUBTEST_1( selfadjointeigensolver(Matrix, 1, 1>())); + // very important to test 3x3 and 2x2 matrices since we provide special paths for them CALL_SUBTEST_12( selfadjointeigensolver(Matrix2f()) ); CALL_SUBTEST_12( selfadjointeigensolver(Matrix2d()) ); + CALL_SUBTEST_12( selfadjointeigensolver(Matrix2cd()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3f()) ); CALL_SUBTEST_13( selfadjointeigensolver(Matrix3d()) ); + CALL_SUBTEST_13( selfadjointeigensolver(Matrix3cd()) ); CALL_SUBTEST_2( selfadjointeigensolver(Matrix4d()) ); + CALL_SUBTEST_2( selfadjointeigensolver(Matrix4cd()) ); s = internal::random(1,EIGEN_TEST_MAX_SIZE/4); CALL_SUBTEST_3( selfadjointeigensolver(MatrixXf(s,s)) ); @@ -254,6 +260,8 @@ EIGEN_DECLARE_TEST(eigensolver_selfadjoint) // some trivial but implementation-wise tricky cases CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(1,1)) ); CALL_SUBTEST_4( selfadjointeigensolver(MatrixXd(2,2)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(1,1)) ); + CALL_SUBTEST_5( selfadjointeigensolver(MatrixXcd(2,2)) ); CALL_SUBTEST_6( selfadjointeigensolver(Matrix()) ); CALL_SUBTEST_7( selfadjointeigensolver(Matrix()) ); } diff --git a/test/geo_hyperplane.cpp b/test/geo_hyperplane.cpp index 2c89ecd2130ba84b03695e52e0dba9223e4ebea5..44b2f2aecb3e8e9b239f93cc586778f0b11b779d 100644 --- a/test/geo_hyperplane.cpp +++ b/test/geo_hyperplane.cpp @@ -172,11 +172,6 @@ template void hyperplane_alignment() VERIFY_IS_APPROX(p1->coeffs(), p2->coeffs()); VERIFY_IS_APPROX(p1->coeffs(), p3->coeffs()); - - #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES > 0 - if(internal::packet_traits::Vectorizable && internal::packet_traits::size<=4) - VERIFY_RAISES_ASSERT((::new(reinterpret_cast(array3u)) Plane3a)); - #endif } diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp index b7b660740284d3bb94b14693a4a1134bfb8f6eac..5f7ddb91f730e02428867bd1a019377ac83ad19f 100644 --- a/test/geo_orthomethods.cpp +++ b/test/geo_orthomethods.cpp @@ -73,8 +73,9 @@ template void orthomethods_3() // check mixed product typedef Matrix RealVector3; RealVector3 rv1 = RealVector3::Random(); - VERIFY_IS_APPROX(v1.cross(rv1.template cast()), v1.cross(rv1)); - VERIFY_IS_APPROX(rv1.template cast().cross(v1), rv1.cross(v1)); + v2 = rv1.template cast(); + VERIFY_IS_APPROX(v1.cross(v2), v1.cross(rv1)); + 
diff --git a/test/geo_orthomethods.cpp b/test/geo_orthomethods.cpp
index b7b660740284d3bb94b14693a4a1134bfb8f6eac..5f7ddb91f730e02428867bd1a019377ac83ad19f 100644
--- a/test/geo_orthomethods.cpp
+++ b/test/geo_orthomethods.cpp
@@ -73,8 +73,9 @@ template<typename Scalar> void orthomethods_3()
   // check mixed product
   typedef Matrix<RealScalar, 3, 1> RealVector3;
   RealVector3 rv1 = RealVector3::Random();
-  VERIFY_IS_APPROX(v1.cross(rv1.template cast<Scalar>()), v1.cross(rv1));
-  VERIFY_IS_APPROX(rv1.template cast<Scalar>().cross(v1), rv1.cross(v1));
+  v2 = rv1.template cast<Scalar>();
+  VERIFY_IS_APPROX(v1.cross(v2), v1.cross(rv1));
+  VERIFY_IS_APPROX(v2.cross(v1), rv1.cross(v1));
 }
 
 template<typename Scalar, int Size> void orthomethods(int size=Size)
diff --git a/test/geo_parametrizedline.cpp b/test/geo_parametrizedline.cpp
index 7135c8fa54d8162b280f8eee9577052abb43573b..e4b194abc407a7b45bbc437688462051b1132864 100644
--- a/test/geo_parametrizedline.cpp
+++ b/test/geo_parametrizedline.cpp
@@ -110,11 +110,6 @@ template<typename Scalar> void parametrizedline_alignment()
   VERIFY_IS_APPROX(p1->origin(), p3->origin());
   VERIFY_IS_APPROX(p1->direction(), p2->direction());
   VERIFY_IS_APPROX(p1->direction(), p3->direction());
-
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Line4a));
-  #endif
 }
 
 EIGEN_DECLARE_TEST(geo_parametrizedline)
diff --git a/test/geo_quaternion.cpp b/test/geo_quaternion.cpp
index c4a3162b357b5ef6034729083e7089147844ff5b..c561fc89d0e8f314354468824cce8a3bfd8d3511 100644
--- a/test/geo_quaternion.cpp
+++ b/test/geo_quaternion.cpp
@@ -218,10 +218,6 @@ template<typename Scalar> void mapQuaternion(void){
   VERIFY_IS_APPROX(q1.coeffs(), q2.coeffs());
   VERIFY_IS_APPROX(q1.coeffs(), q3.coeffs());
   VERIFY_IS_APPROX(q4.coeffs(), q3.coeffs());
-  #ifdef EIGEN_VECTORIZE
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((MQuaternionA(array3unaligned)));
-  #endif
 
   VERIFY_IS_APPROX(mq1 * (mq1.inverse() * v1), v1);
   VERIFY_IS_APPROX(mq1 * (mq1.conjugate() * v1), v1);
@@ -281,10 +277,6 @@ template<typename Scalar> void quaternionAlignment(void){
   VERIFY_IS_APPROX(q1->coeffs(), q2->coeffs());
   VERIFY_IS_APPROX(q1->coeffs(), q3->coeffs());
-
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable && internal::packet_traits<Scalar>::size<=4)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(arrayunaligned)) QuaternionA));
-  #endif
 }
 
 template<typename PlainObjectType> void check_const_correctness(const PlainObjectType&)
diff --git a/test/geo_transformations.cpp b/test/geo_transformations.cpp
index d433561cbb1387e642c71e47d08241a98625a64d..72c6edac14708ffb3969855dd7096ca94504773f 100644
--- a/test/geo_transformations.cpp
+++ b/test/geo_transformations.cpp
@@ -582,11 +582,6 @@ template<typename Scalar> void transform_alignment()
   VERIFY_IS_APPROX(p1->matrix(), p3->matrix());
   VERIFY_IS_APPROX( (*p1) * (*p1), (*p2)*(*p3));
-
-  #if defined(EIGEN_VECTORIZE) && EIGEN_MAX_STATIC_ALIGN_BYTES>0
-  if(internal::packet_traits<Scalar>::Vectorizable)
-    VERIFY_RAISES_ASSERT((::new(reinterpret_cast<void*>(array3u)) Projective3a));
-  #endif
 }
 
 template<typename Scalar> void transform_products()
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index bf8dcacde280039e0c2eb4c329659ca7cd5bdff8..e424a93c9e86959635d64734588a97e83158f072 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -138,10 +138,12 @@ struct complex_operators {
     out[out_idx++] = a / numext::real(b);
     out[out_idx++] = numext::real(a) / b;
 
+#if !defined(EIGEN_COMP_MSVC)
     out[out_idx] = a; out[out_idx++] += b;
     out[out_idx] = a; out[out_idx++] -= b;
     out[out_idx] = a; out[out_idx++] *= b;
     out[out_idx] = a; out[out_idx++] /= b;
+#endif
 
     const ComplexType true_value = ComplexType(ValueType(1), ValueType(0));
     const ComplexType false_value = ComplexType(ValueType(0), ValueType(0));
@@ -188,6 +190,7 @@ struct complex_operators {
     res.segment(block_idx, size) = x1.real().array() / x2.array();
     block_idx += size;
 
+#if !defined(EIGEN_COMP_MSVC)
     res.segment(block_idx, size) = x1; res.segment(block_idx, size) += x2;
     block_idx += size;
     res.segment(block_idx, size) = x1; res.segment(block_idx, size) -= x2;
@@ -196,19 +199,19 @@ struct
complex_operators { block_idx += size; res.segment(block_idx, size) = x1; res.segment(block_idx, size).array() /= x2.array(); block_idx += size; +#endif - // Equality comparisons currently not functional on device - // (std::equal_to is host-only). - // const T true_vector = T::Constant(true_value); - // const T false_vector = T::Constant(false_value); - // res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); - // block_idx += size; + const T true_vector = T::Constant(true_value); + const T false_vector = T::Constant(false_value); + res.segment(block_idx, size) = (x1 == x2 ? true_vector : false_vector); + block_idx += size; + // Mixing types in equality comparison does not work. // res.segment(block_idx, size) = (x1 == x2.real() ? true_vector : false_vector); // block_idx += size; // res.segment(block_idx, size) = (x1.real() == x2 ? true_vector : false_vector); // block_idx += size; - // res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector); - // block_idx += size; + res.segment(block_idx, size) = (x1 != x2 ? true_vector : false_vector); + block_idx += size; // res.segment(block_idx, size) = (x1 != x2.real() ? true_vector : false_vector); // block_idx += size; // res.segment(block_idx, size) = (x1.real() != x2 ? true_vector : false_vector); diff --git a/test/half_float.cpp b/test/half_float.cpp index 729de1bc725d2b0d4db43c7f9e48f90de92b50be..ffb3215b9d85fcd574122654a907ddfd2d08d923 100644 --- a/test/half_float.cpp +++ b/test/half_float.cpp @@ -224,6 +224,8 @@ void test_comparison() void test_basic_functions() { + const float PI = static_cast(EIGEN_PI); + VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(abs(half(3.5f))), 3.5f); VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); @@ -251,8 +253,8 @@ void test_basic_functions() VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); VERIFY_IS_EQUAL(float(exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); - VERIFY_IS_APPROX(float(exp(half(EIGEN_PI))), 20.f + float(EIGEN_PI)); + VERIFY_IS_APPROX(float(numext::exp(half(PI))), 20.f + PI); + VERIFY_IS_APPROX(float(exp(half(PI))), 20.f + PI); VERIFY_IS_EQUAL(float(numext::expm1(half(0.0f))), 0.0f); VERIFY_IS_EQUAL(float(expm1(half(0.0f))), 0.0f); @@ -277,25 +279,26 @@ void test_basic_functions() void test_trigonometric_functions() { + const float PI = static_cast(EIGEN_PI); VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); VERIFY_IS_APPROX(cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - // VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); + VERIFY_IS_APPROX(numext::cos(half(PI)), half(cosf(PI))); + // VERIFY_IS_APPROX(numext::cos(half(PI/2)), half(cosf(PI/2))); + // VERIFY_IS_APPROX(numext::cos(half(3*PI/2)), half(cosf(3*PI/2))); VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); VERIFY_IS_APPROX(sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); + // VERIFY_IS_APPROX(numext::sin(half(PI)), half(sinf(PI))); + VERIFY_IS_APPROX(numext::sin(half(PI/2)), half(sinf(PI/2))); + VERIFY_IS_APPROX(numext::sin(half(3*PI/2)), 
half(sinf(3*PI/2)));
   VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f)));
 
   VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f)));
   VERIFY_IS_APPROX(tan(half(0.0f)), half(tanf(0.0f)));
-  // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI)));
-  // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2)));
-  //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2)));
+  // VERIFY_IS_APPROX(numext::tan(half(PI)), half(tanf(PI)));
+  // VERIFY_IS_APPROX(numext::tan(half(PI/2)), half(tanf(PI/2)));
+  //VERIFY_IS_APPROX(numext::tan(half(3*PI/2)), half(tanf(3*PI/2)));
   VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f)));
 }
diff --git a/test/inverse.cpp b/test/inverse.cpp
index 99f9e0c9b238fd280c89ea4df93a397f89958499..9cedfa1e1069ce7a15e4056a6ab95fe4cccc7405 100644
--- a/test/inverse.cpp
+++ b/test/inverse.cpp
@@ -135,6 +135,8 @@ EIGEN_DECLARE_TEST(inverse)
     CALL_SUBTEST_5( inverse(MatrixXf(s,s)) );
     TEST_SET_BUT_UNUSED_VARIABLE(s)
     CALL_SUBTEST_5( inverse_zerosized<MatrixXf>() );
+    CALL_SUBTEST_5( inverse(MatrixXf(0, 0)) );
+    CALL_SUBTEST_5( inverse(MatrixXf(1, 1)) );
 
     s = internal::random<int>(25,100);
     CALL_SUBTEST_6( inverse(MatrixXcd(s,s)) );
diff --git a/test/jacobisvd.cpp b/test/jacobisvd.cpp
index 89484d97106a17c80e537db61d7060c257f2c8ce..5b15c5a27bff8996be7ab66dc49581fcfd226840 100644
--- a/test/jacobisvd.cpp
+++ b/test/jacobisvd.cpp
@@ -36,6 +36,9 @@ void jacobisvd(const MatrixType& a = MatrixType(), bool pickrandom = true)
 template<typename MatrixType> void jacobisvd_verify_assert(const MatrixType& m)
 {
   svd_verify_assert<JacobiSVD<MatrixType> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner> >(m, true);
+  svd_verify_assert<JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner> >(m);
+  svd_verify_assert<JacobiSVD<MatrixType, HouseholderQRPreconditioner> >(m);
   Index rows = m.rows();
   Index cols = m.cols();
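The new `svd_verify_assert` calls above exercise JacobiSVD's QR preconditioners; the `true` flag marks the full-pivoting variant, which only supports full unitaries. A minimal usage sketch (matrix size is illustrative):

```cpp
#include <Eigen/SVD>

int main() {
  Eigen::MatrixXf m = Eigen::MatrixXf::Random(5, 3);
  // FullPivHouseholderQRPreconditioner supports only ComputeFullU/V,
  // which is why the test passes fullOnly = true for it.
  Eigen::JacobiSVD<Eigen::MatrixXf, Eigen::FullPivHouseholderQRPreconditioner>
      svd(m, Eigen::ComputeFullU | Eigen::ComputeFullV);
  return svd.singularValues().size() == 3 ? 0 : 1;
}
```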
diff --git a/test/main.h b/test/main.h
index 07f3794ac8be407cc859bc44ae2afa14902d8d21..19bbf1b8191d7b0e07c23ffe506e8ef6c351f6ee 100644
--- a/test/main.h
+++ b/test/main.h
@@ -111,6 +111,9 @@ struct imag {};
 // `I` may be defined by complex.h:
 #define I  FORBIDDEN_IDENTIFIER
 
+// _res is defined by resolv.h
+#define _res  FORBIDDEN_IDENTIFIER
+
 // Unit tests calling Eigen's blas library must preserve the default blocking size
 // to avoid troubles.
 #ifndef EIGEN_NO_DEBUG_SMALL_PRODUCT_BLOCKS
@@ -391,6 +394,8 @@ inline void verify_impl(bool condition, const char *testname, const char *file,
 #define VERIFY_IS_NOT_MUCH_SMALLER_THAN(a, b) VERIFY(!test_isMuchSmallerThan(a, b))
 #define VERIFY_IS_APPROX_OR_LESS_THAN(a, b) VERIFY(test_isApproxOrLessThan(a, b))
 #define VERIFY_IS_NOT_APPROX_OR_LESS_THAN(a, b) VERIFY(!test_isApproxOrLessThan(a, b))
+#define VERIFY_IS_CWISE_EQUAL(a, b)  VERIFY(verifyIsCwiseApprox(a, b, true))
+#define VERIFY_IS_CWISE_APPROX(a, b) VERIFY(verifyIsCwiseApprox(a, b, false))
 
 #define VERIFY_IS_UNITARY(a) VERIFY(test_isUnitary(a))
 
@@ -422,7 +427,13 @@ template<> inline long double test_precision<std::complex<long double> >() { return test_precision<long double>(); }
 
 #define EIGEN_TEST_SCALAR_TEST_OVERLOAD(TYPE)                             \
   inline bool test_isApprox(TYPE a, TYPE b)                               \
-  { return internal::isApprox(a, b, test_precision<TYPE>()); }            \
+  { return numext::equal_strict(a, b) ||                                  \
+           ((numext::isnan)(a) && (numext::isnan)(b)) ||                  \
+           (internal::isApprox(a, b, test_precision<TYPE>())); }          \
+  inline bool test_isCwiseApprox(TYPE a, TYPE b, bool exact)              \
+  { return numext::equal_strict(a, b) ||                                  \
+           ((numext::isnan)(a) && (numext::isnan)(b)) ||                  \
+           (!exact && internal::isApprox(a, b, test_precision<TYPE>())); } \
   inline bool test_isMuchSmallerThan(TYPE a, TYPE b)                      \
   { return internal::isMuchSmallerThan(a, b, test_precision<TYPE>()); }   \
   inline bool test_isApproxOrLessThan(TYPE a, TYPE b)                     \
@@ -592,6 +603,22 @@ inline bool verifyIsApprox(const Type1& a, const Type2& b)
   return ret;
 }
 
+// verifyIsCwiseApprox is a wrapper to test_isCwiseApprox that outputs the
+// relative difference magnitude if the test fails.
+template<typename Type1, typename Type2>
+inline bool verifyIsCwiseApprox(const Type1& a, const Type2& b, bool exact)
+{
+  bool ret = test_isCwiseApprox(a, b, exact);
+  if(!ret) {
+    if (exact) {
+      std::cerr << "Values are not an exact match";
+    } else {
+      std::cerr << "Difference too large wrt tolerance " << get_test_precision(a);
+    }
+    std::cerr << ", relative error is: " << test_relative_error(a,b) << std::endl;
+  }
+  return ret;
+}
+
 // The idea behind this function is to compare the two scalars a and b where
 // the scalar ref is a hint about the expected order of magnitude of a and b.
 // WARNING: the scalar a and b must be positive
diff --git a/test/mixingtypes.cpp b/test/mixingtypes.cpp
index d450dbff8bbfd324e5b7b19b0329dbebef52ba45..2af7b888747598d546615e2d92de8ff3a8deb937 100644
--- a/test/mixingtypes.cpp
+++ b/test/mixingtypes.cpp
@@ -139,11 +139,12 @@ template<int SizeAtCompileType> void mixingtypes(int size = SizeAtCompileType)
   VERIFY_MIX_SCALAR(scd - vd.array()   , scd - vd.template cast<complex<double> >().array());
 
   // check scalar powers
-  VERIFY_MIX_SCALAR( pow(vcf.array(), sf),        Eigen::pow(vcf.array(), complex<float>(sf)) );
-  VERIFY_MIX_SCALAR( vcf.array().pow(sf) ,        Eigen::pow(vcf.array(), complex<float>(sf)) );
+  // NOTE: scalar exponents use a unary op.
+ VERIFY_IS_APPROX( pow(vcf.array(), sf), Eigen::pow(vcf.array(), complex(sf)) ); + VERIFY_IS_APPROX( vcf.array().pow(sf) , Eigen::pow(vcf.array(), complex(sf)) ); VERIFY_MIX_SCALAR( pow(sd, vcd.array()), Eigen::pow(complex(sd), vcd.array()) ); - VERIFY_MIX_SCALAR( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); - VERIFY_MIX_SCALAR( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( Eigen::pow(vf.array(), scf), Eigen::pow(vf.template cast >().array(), scf) ); + VERIFY_IS_APPROX( vf.array().pow(scf) , Eigen::pow(vf.template cast >().array(), scf) ); VERIFY_MIX_SCALAR( Eigen::pow(scd, vd.array()), Eigen::pow(scd, vd.template cast >().array()) ); // check dot product diff --git a/test/nestbyvalue.cpp b/test/nestbyvalue.cpp index c5356bc24cb8e571309b5cb674aecc41602fedd2..3a86bea5057e7da82d7a9e79ed780b8ebbfd44a0 100644 --- a/test/nestbyvalue.cpp +++ b/test/nestbyvalue.cpp @@ -26,7 +26,7 @@ EIGEN_DECLARE_TEST(nestbyvalue) for(int i = 0; i < g_repeat; i++) { Index rows = internal::random(1,EIGEN_TEST_MAX_SIZE); Index cols = internal::random(1,EIGEN_TEST_MAX_SIZE); - MatrixXd a = MatrixXd(rows,cols); + MatrixXd a = MatrixXd::Random(rows,cols); nb_temporaries = 0; XprType x = get_xpr_with_temps(a); VERIFY_IS_EQUAL(nb_temporaries,6); diff --git a/test/numext.cpp b/test/numext.cpp index cf1ca173dd66f3abc49974df24f09843fedcd2db..8a2fde5015bc50baaf41d0c3b3eb8c2662626f20 100644 --- a/test/numext.cpp +++ b/test/numext.cpp @@ -61,6 +61,20 @@ void check_abs() { } } +template +void check_arg() { + typedef typename NumTraits::Real Real; + VERIFY_IS_EQUAL(numext::abs(T(0)), T(0)); + VERIFY_IS_EQUAL(numext::abs(T(1)), T(1)); + + for(int k=0; k<100; ++k) + { + T x = internal::random(); + Real y = numext::arg(x); + VERIFY_IS_APPROX( y, std::arg(x) ); + } +} + template struct check_sqrt_impl { static void run() { @@ -242,10 +256,12 @@ EIGEN_DECLARE_TEST(numext) { CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); CALL_SUBTEST( check_abs() ); - CALL_SUBTEST( check_abs >() ); CALL_SUBTEST( check_abs >() ); + CALL_SUBTEST( check_arg >() ); + CALL_SUBTEST( check_arg >() ); + CALL_SUBTEST( check_sqrt() ); CALL_SUBTEST( check_sqrt() ); CALL_SUBTEST( check_sqrt >() ); diff --git a/test/packetmath.cpp b/test/packetmath.cpp index 67d329a67ff03bdb5aa633a2e8f46c5b88c89ee0..518b801b9f274448cc3e51dfaed5d1ca7fb11575 100644 --- a/test/packetmath.cpp +++ b/test/packetmath.cpp @@ -52,6 +52,12 @@ inline T REF_FREXP(const T& x, T& exp) { EIGEN_USING_STD(frexp) const T out = static_cast(frexp(x, &iexp)); exp = static_cast(iexp); + + // The exponent value is unspecified if the input is inf or NaN, but MSVC + // seems to set it to 1. We need to set it back to zero for consistency. + if (!(numext::isfinite)(x)) { + exp = T(0); + } return out; } @@ -279,19 +285,84 @@ void packetmath_boolean_mask_ops() { CHECK_CWISE2_IF(true, internal::pcmp_eq, internal::pcmp_eq); } +template +void packetmath_boolean_mask_ops_real() { + const int PacketSize = internal::unpacket_traits::size; + const int size = 2 * PacketSize; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data1[i + PacketSize] = internal::random() ? 
data1[i] : Scalar(0); + } + + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); + + //Test (-0) <=/< (0) for signed operations + for (int i = 0; i < PacketSize; ++i) { + data1[i] = Scalar(-0.0); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); + + //Test NaN + for (int i = 0; i < PacketSize; ++i) { + data1[i] = NumTraits::quiet_NaN(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_lt_or_nan, internal::pcmp_lt_or_nan); +} + +template +void packetmath_boolean_mask_ops_notcomplex() { + const int PacketSize = internal::unpacket_traits::size; + const int size = 2 * PacketSize; + EIGEN_ALIGN_MAX Scalar data1[size]; + EIGEN_ALIGN_MAX Scalar data2[size]; + EIGEN_ALIGN_MAX Scalar ref[size]; + + for (int i = 0; i < PacketSize; ++i) { + data1[i] = internal::random(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); + + //Test (-0) <=/< (0) for signed operations + for (int i = 0; i < PacketSize; ++i) { + data1[i] = Scalar(-0.0); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); + + //Test NaN + for (int i = 0; i < PacketSize; ++i) { + data1[i] = NumTraits::quiet_NaN(); + data1[i + PacketSize] = internal::random() ? data1[i] : Scalar(0); + } + CHECK_CWISE2_IF(true, internal::pcmp_le, internal::pcmp_le); + CHECK_CWISE2_IF(true, internal::pcmp_lt, internal::pcmp_lt); +} + // Packet16b representing bool does not support ptrue, pandnot or pcmp_eq, since the scalar path // (for some compilers) compute the bitwise and with 0x1 of the results to keep the value in [0,1]. template<> void packetmath_boolean_mask_ops::type>() {} +template<> +void packetmath_boolean_mask_ops_notcomplex::type>() {} template void packetmath_minus_zero_add() { const int PacketSize = internal::unpacket_traits::size; const int size = 2 * PacketSize; - EIGEN_ALIGN_MAX Scalar data1[size]; - EIGEN_ALIGN_MAX Scalar data2[size]; - EIGEN_ALIGN_MAX Scalar ref[size]; - + EIGEN_ALIGN_MAX Scalar data1[size] = {}; + EIGEN_ALIGN_MAX Scalar data2[size] = {}; + EIGEN_ALIGN_MAX Scalar ref[size] = {}; + for (int i = 0; i < PacketSize; ++i) { data1[i] = Scalar(-0.0); data1[i + PacketSize] = Scalar(-0.0); @@ -453,9 +524,7 @@ void packetmath() { for (int i = 0; i < PacketSize; ++i) ref[0] += data1[i]; VERIFY(test::isApproxAbs(ref[0], internal::predux(internal::pload(data1)), refvalue) && "internal::predux"); - if (PacketSize == 8 && internal::unpacket_traits::half>::size == - 4) // so far, predux_half_downto4 is only required in such a case - { + if (!internal::is_same::half>::value) { int HalfPacketSize = PacketSize > 4 ? 
PacketSize / 2 : PacketSize; for (int i = 0; i < HalfPacketSize; ++i) ref[i] = Scalar(0); for (int i = 0; i < PacketSize; ++i) ref[i % HalfPacketSize] += data1[i]; @@ -483,6 +552,27 @@ void packetmath() { } } + // GeneralBlockPanelKernel also checks PacketBlock; + if (PacketSize > 4 && PacketSize % 4 == 0) { + internal::PacketBlock kernel2; + for (int i = 0; i < 4; ++i) { + kernel2.packet[i] = internal::pload(data1 + i * PacketSize); + } + ptranspose(kernel2); + int data_counter = 0; + for (int i = 0; i < PacketSize; ++i) { + for (int j = 0; j < 4; ++j) { + data2[data_counter++] = data1[j*PacketSize + i]; + } + } + for (int i = 0; i < 4; ++i) { + internal::pstore(data3, kernel2.packet[i]); + for (int j = 0; j < PacketSize; ++j) { + VERIFY(test::isApproxAbs(data3[j], data2[i*PacketSize + j], refvalue) && "ptranspose"); + } + } + } + if (PacketTraits::HasBlend) { Packet thenPacket = internal::pload(data1); Packet elsePacket = internal::pload(data2); @@ -547,9 +637,19 @@ void packetmath_real() { const int PacketSize = internal::unpacket_traits::size; const int size = PacketSize * 4; - EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4]; - EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4]; - EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4]; + EIGEN_ALIGN_MAX Scalar data1[PacketSize * 4] = {}; + EIGEN_ALIGN_MAX Scalar data2[PacketSize * 4] = {}; + EIGEN_ALIGN_MAX Scalar ref[PacketSize * 4] = {}; + + // Negate with -0. + if (PacketTraits::HasNegate) { + test::packet_helper h; + data1[0] = Scalar(-0); + h.store(data2, internal::pnegate(h.load(data1))); + typedef typename internal::make_unsigned::type>::type Bits; + Bits bits = numext::bit_cast(data2[0]); + VERIFY_IS_EQUAL(bits, static_cast(Bits(1)<<(sizeof(Scalar)*CHAR_BIT - 1))); + } for (int i = 0; i < size; ++i) { data1[i] = Scalar(internal::random(0, 1) * std::pow(10., internal::random(-6, 6))); @@ -574,6 +674,8 @@ void packetmath_real() { CHECK_CWISE1_EXACT_IF(PacketTraits::HasCeil, numext::ceil, internal::pceil); CHECK_CWISE1_EXACT_IF(PacketTraits::HasFloor, numext::floor, internal::pfloor); CHECK_CWISE1_EXACT_IF(PacketTraits::HasRint, numext::rint, internal::print); + + packetmath_boolean_mask_ops_real(); // Rounding edge cases. 
if (PacketTraits::HasRound || PacketTraits::HasCeil || PacketTraits::HasFloor || PacketTraits::HasRint) { @@ -1020,6 +1122,8 @@ void packetmath_notcomplex() { CHECK_CWISE2_IF(PacketTraits::HasMin, propagate_nan_min, (internal::pmin)); CHECK_CWISE2_IF(PacketTraits::HasMax, propagate_nan_max, internal::pmax); } + + packetmath_boolean_mask_ops_notcomplex(); } template diff --git a/test/prec_inverse_4x4.cpp b/test/prec_inverse_4x4.cpp index 07246646793ff97b047160428f1f5024ce0305ef..86f057118cf1c3ba1e906eb7fd6160c4cbd4636f 100644 --- a/test/prec_inverse_4x4.cpp +++ b/test/prec_inverse_4x4.cpp @@ -30,18 +30,17 @@ template void inverse_general_4x4(int repeat) { using std::abs; typedef typename MatrixType::Scalar Scalar; - typedef typename MatrixType::RealScalar RealScalar; double error_sum = 0., error_max = 0.; for(int i = 0; i < repeat; ++i) { MatrixType m; - RealScalar absdet; + bool is_invertible; do { m = MatrixType::Random(); - absdet = abs(m.determinant()); - } while(absdet < NumTraits::epsilon()); + is_invertible = Eigen::FullPivLU(m).isInvertible(); + } while(!is_invertible); MatrixType inv = m.inverse(); - double error = double( (m*inv-MatrixType::Identity()).norm() * absdet / NumTraits::epsilon() ); + double error = double( (m*inv-MatrixType::Identity()).norm()); error_sum += error; error_max = (std::max)(error_max, error); } diff --git a/test/product_small.cpp b/test/product_small.cpp index 1d6df6e58765451d0bbf58508111816a85b75620..fec7f5658f9b6c920740e5ac8aec7a91a3beddda 100644 --- a/test/product_small.cpp +++ b/test/product_small.cpp @@ -70,7 +70,7 @@ void test_dynamic_bool() for(Index i=0;i +// Copyright (C) 2013 Gael Guennebaud // // This Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed diff --git a/test/reshape.cpp b/test/reshape.cpp index 7b16742a23faeaafadad3fc95dec7a9918c0db7b..1f966ebaee6e4e4bed8223df6953005d7e2ca190 100644 --- a/test/reshape.cpp +++ b/test/reshape.cpp @@ -193,6 +193,24 @@ void reshape4x4(MatType m) } } +template +void reshape_block(const BlockType& M) { + typename BlockType::PlainObject dense = M.eval(); + Index rows = M.size() / 2; + Index cols = M.size() / rows; + VERIFY_IS_EQUAL(dense.reshaped(rows, cols), M.reshaped(rows, cols)); + + for (Index i=0; i RowMatrixXi; @@ -213,4 +231,5 @@ EIGEN_DECLARE_TEST(reshape) CALL_SUBTEST(reshape4x4(rmx)); CALL_SUBTEST(reshape4x4(rm4)); + CALL_SUBTEST(reshape_block(rm4.col(1))); } diff --git a/test/rvalue_types.cpp b/test/rvalue_types.cpp index c20a32f790afbe62ff7240988731912a94896f46..2c9999ce80851e46579b405669bf8d912a77c6bf 100644 --- a/test/rvalue_types.cpp +++ b/test/rvalue_types.cpp @@ -13,41 +13,12 @@ #if EIGEN_HAS_CXX11 #include "MovableScalar.h" #endif +#include "SafeScalar.h" #include using internal::UIntPtr; -// A Scalar that asserts for uninitialized access. 
-// A Scalar that asserts for uninitialized access.
-template<typename T>
-class SafeScalar {
- public:
-  SafeScalar() : initialized_(false) {}
-  SafeScalar(const SafeScalar& other) {
-    *this = other;
-  }
-  SafeScalar& operator=(const SafeScalar& other) {
-    val_ = T(other);
-    initialized_ = true;
-    return *this;
-  }
-
-  SafeScalar(T val) : val_(val), initialized_(true) {}
-  SafeScalar& operator=(T val) {
-    val_ = val;
-    initialized_ = true;
-  }
-
-  operator T() const {
-    VERIFY(initialized_ && "Uninitialized access.");
-    return val_;
-  }
-
- private:
-  T val_;
-  bool initialized_;
-};
-
 #if EIGEN_HAS_RVALUE_REFERENCES
 
 template <typename MatrixType>
 void rvalue_copyassign(const MatrixType& m)
diff --git a/test/schur_complex.cpp b/test/schur_complex.cpp
index 03e17e81defd09f67f87866a34ba684aa629f5d6..26acb8c3ac3f67618bfe6363a76fe205e77eb9d3 100644
--- a/test/schur_complex.cpp
+++ b/test/schur_complex.cpp
@@ -54,7 +54,8 @@ template<typename MatrixType> void schur(int size = MatrixType::ColsAtCompileTime)
   VERIFY_IS_EQUAL(cs3.matrixT(), cs1.matrixT());
   VERIFY_IS_EQUAL(cs3.matrixU(), cs1.matrixU());
   cs3.setMaxIterations(1).compute(A);
-  VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success);
+  // The Schur decomposition often converges within a single iteration, so this check is disabled.
+  // VERIFY_IS_EQUAL(cs3.info(), size > 1 ? NoConvergence : Success);
   VERIFY_IS_EQUAL(cs3.getMaxIterations(), 1);
 
   MatrixType Atriangular = A;
diff --git a/test/sparse_block.cpp b/test/sparse_block.cpp
index f9668102c431dcda56f2ae2cda1ebc829814899f..b4905b0531743a5d3eefa066d6994382b5eae4ec 100644
--- a/test/sparse_block.cpp
+++ b/test/sparse_block.cpp
@@ -315,8 +315,9 @@ EIGEN_DECLARE_TEST(sparse_block)
     CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,ColMajor,short>(short(r), short(c))) ));
     CALL_SUBTEST_4(( sparse_block(SparseMatrix<double,RowMajor,short>(short(r), short(c))) ));
-
+#ifndef EIGEN_TEST_ANNOYING_SCALAR_DONT_THROW
     AnnoyingScalar::dont_throw = true;
+#endif
     CALL_SUBTEST_5(( sparse_block(SparseMatrix<AnnoyingScalar>(r,c)) ));
   }
 }
diff --git a/test/sparse_ref.cpp b/test/sparse_ref.cpp
index 12b6f8a9dd7b66b6b9f9badb105e54e45bb50178..8f33af858ca56e8e0d32625acc5c03ac7581f75f 100644
--- a/test/sparse_ref.cpp
+++ b/test/sparse_ref.cpp
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 20015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
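The next hunk makes the sparse solvers write into a non-contiguous destination through a strided `Map`. For reference, a small sketch of mapping every other entry of a larger buffer (names and sizes are illustrative, not from the patch):

```cpp
#include <Eigen/Dense>

int main() {
  const int n = 3, m = 2;
  Eigen::MatrixXd out = Eigen::MatrixXd::Zero(2 * n, 2 * m);
  // View an n-by-m matrix whose entries occupy every other slot:
  // outer stride 2*n (columns are 2*n apart), inner stride 2.
  Eigen::Map<Eigen::MatrixXd, 0, Eigen::Stride<Eigen::Dynamic, 2> >
      view(out.data(), n, m, Eigen::Stride<Eigen::Dynamic, 2>(2 * n, 2));
  view.setConstant(1.0);  // writes through the strided view into 'out'
  return 0;
}
```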
diff --git a/test/sparse_solver.h b/test/sparse_solver.h
index 58927944bdd98ac86133025c79a9437daefe9018..6f95e2fa7a9df00d41aafd890c6073df9100536f 100644
--- a/test/sparse_solver.h
+++ b/test/sparse_solver.h
@@ -99,6 +99,13 @@ void check_sparse_solving(Solver& solver, const typename Solver::MatrixType& A,
       VERIFY(solver.info() == Success && "solving failed when using Map");
       VERIFY(oldb.isApprox(bm) && "sparse solver testing: the rhs should not be modified!");
       VERIFY(xm.isApprox(refX,test_precision<Scalar>()));
+
+      // Test with a Map and non-unit stride.
+      Eigen::Matrix<Scalar, Dynamic, Dynamic> out(2*xm.rows(), 2*xm.cols());
+      out.setZero();
+      Eigen::Map<DenseRhs, 0, Stride<Dynamic, 2> > outm(out.data(), xm.rows(), xm.cols(), Stride<Dynamic, 2>(2 * xm.rows(), 2));
+      outm = solver.solve(bm);
+      VERIFY(outm.isApprox(refX,test_precision<Scalar>()));
     }
 
     // if not too large, do some extra check:
diff --git a/test/stdlist_overload.cpp b/test/stdlist_overload.cpp
index a78516e24eba1a093a0141cfbf3a3b6ed8b383ca..843e28c3cbf87a3e521d07f28c6c5d5fe58d8756 100644
--- a/test/stdlist_overload.cpp
+++ b/test/stdlist_overload.cpp
@@ -63,7 +63,7 @@ void check_stdlist_matrix(const MatrixType& m)
     ++itw;
   }
 
-  v.resize(21);
+  v.resize(21, MatrixType::Zero(rows, cols));
   set(v, 20, x);
   VERIFY_IS_APPROX(*get(v, 20), x);
   v.resize(22,y);
diff --git a/test/stdvector.cpp b/test/stdvector.cpp
index 18de240c6e68f490b590808cef56ad0515fc6ff2..9c023d656483fc6914c1ab15b0c549adefb3d34d 100644
--- a/test/stdvector.cpp
+++ b/test/stdvector.cpp
@@ -52,7 +52,7 @@ void check_stdvector_transform(const TransformType&)
 {
   typedef typename TransformType::MatrixType MatrixType;
   TransformType x(MatrixType::Random()), y(MatrixType::Random());
-  std::vector<TransformType, aligned_allocator<TransformType> > v(10), w(20, y);
+  std::vector<TransformType, aligned_allocator<TransformType> > v(10, TransformType(MatrixType::Zero())), w(20, y);
   v[5] = x;
   w[6] = v[5];
   VERIFY_IS_APPROX(w[6], v[5]);
@@ -124,7 +124,7 @@ void std_vector_gcc_warning()
 {
   typedef Eigen::Vector3f T;
   std::vector<T, Eigen::aligned_allocator<T> > v;
-  v.push_back(T());
+  v.push_back(T::Zero());
 }
 
 EIGEN_DECLARE_TEST(stdvector)
diff --git a/test/svd_common.h b/test/svd_common.h
index bd62edcc8442c8375e6551c4b4e449f3d83581bb..eae4c0bfe31e98b7e3cbe157768300f813220cee 100644
--- a/test/svd_common.h
+++ b/test/svd_common.h
@@ -462,7 +462,7 @@ void svd_preallocate()
 }
 
 template<typename SvdType, typename MatrixType>
-void svd_verify_assert(const MatrixType& m)
+void svd_verify_assert(const MatrixType& m, bool fullOnly = false)
 {
   typedef typename MatrixType::Scalar Scalar;
   Index rows = m.rows();
@@ -489,8 +489,17 @@ void svd_verify_assert(const MatrixType& m)
   VERIFY_RAISES_ASSERT(svd.matrixV())
   svd.singularValues();
   VERIFY_RAISES_ASSERT(svd.solve(rhs))
-
-  if (ColsAtCompileTime == Dynamic)
+
+  svd.compute(a, ComputeFullU);
+  svd.matrixU();
+  VERIFY_RAISES_ASSERT(svd.matrixV())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+  svd.compute(a, ComputeFullV);
+  svd.matrixV();
+  VERIFY_RAISES_ASSERT(svd.matrixU())
+  VERIFY_RAISES_ASSERT(svd.solve(rhs))
+
+  if (!fullOnly && ColsAtCompileTime == Dynamic)
   {
     svd.compute(a, ComputeThinU);
     svd.matrixU();
diff --git a/test/symbolic_index.cpp b/test/symbolic_index.cpp
index b114cbb9565eb8ff4a403e918f37211123d7214a..a75ca1165ae5888c97714fecf3d3f8408672dc19 100644
--- a/test/symbolic_index.cpp
+++ b/test/symbolic_index.cpp
@@ -58,15 +58,15 @@ void check_symbolic_index()
   VERIFY( is_same_type( fix<9>()/2, int(9/2) ) );
 
   VERIFY( is_same_symb( lastp1-1, last, size) );
-  VERIFY( is_same_symb( lastp1-fix<1>, last, size) );
+  VERIFY( is_same_symb( lastp1-fix<1>(), last, size) );
 
   VERIFY_IS_EQUAL( ( (last*5-2)/3 ).eval(last=size-1), ((size-1)*5-2)/3 );
-  VERIFY_IS_EQUAL( ( (last*fix<5>-fix<2>)/fix<3> ).eval(last=size-1), ((size-1)*5-2)/3 );
+  VERIFY_IS_EQUAL( ( (last*fix<5>()-fix<2>())/fix<3>() ).eval(last=size-1), ((size-1)*5-2)/3 );
   VERIFY_IS_EQUAL( ( -last*lastp1  ).eval(last=size-1), -(size-1)*size );
   VERIFY_IS_EQUAL( ( lastp1-3*last  ).eval(last=size-1), size- 3*(size-1) );
   VERIFY_IS_EQUAL( ( (lastp1-3*last)/lastp1 ).eval(last=size-1), (size- 3*(size-1))/size );
 
-#if EIGEN_HAS_CXX14
+#if EIGEN_HAS_CXX14_VARIABLE_TEMPLATES
   {
     struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
     struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
diff --git
a/test/unalignedassert.cpp b/test/unalignedassert.cpp deleted file mode 100644 index 120cc42bbf45945fc9761ef1074204be2cbea8ab..0000000000000000000000000000000000000000 --- a/test/unalignedassert.cpp +++ /dev/null @@ -1,180 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008 Benoit Jacob -// Copyright (C) 2015 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#if defined(EIGEN_TEST_PART_1) - // default -#elif defined(EIGEN_TEST_PART_2) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 16 - #define EIGEN_MAX_ALIGN_BYTES 16 -#elif defined(EIGEN_TEST_PART_3) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 32 - #define EIGEN_MAX_ALIGN_BYTES 32 -#elif defined(EIGEN_TEST_PART_4) - #define EIGEN_MAX_STATIC_ALIGN_BYTES 64 - #define EIGEN_MAX_ALIGN_BYTES 64 -#endif - -#include "main.h" - -typedef Matrix Vector6f; -typedef Matrix Vector8f; -typedef Matrix Vector12f; - -typedef Matrix Vector5d; -typedef Matrix Vector6d; -typedef Matrix Vector7d; -typedef Matrix Vector8d; -typedef Matrix Vector9d; -typedef Matrix Vector10d; -typedef Matrix Vector12d; - -struct TestNew1 -{ - MatrixXd m; // good: m will allocate its own array, taking care of alignment. - TestNew1() : m(20,20) {} -}; - -struct TestNew2 -{ - Matrix3d m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned, - // 8-byte alignment is good enough here, which we'll get automatically -}; - -struct TestNew3 -{ - Vector2f m; // good: m's size isn't a multiple of 16 bytes, so m doesn't have to be 16-byte aligned -}; - -struct TestNew4 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - Vector2d m; - float f; // make the struct have sizeof%16!=0 to make it a little more tricky when we allow an array of 2 such objects -}; - -struct TestNew5 -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW - float f; // try the f at first -- the EIGEN_ALIGN_MAX attribute of m should make that still work - Matrix4f m; -}; - -struct TestNew6 -{ - Matrix m; // good: no alignment requested - float f; -}; - -template struct Depends -{ - EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(Align) - Vector2d m; - float f; -}; - -template -void check_unalignedassert_good() -{ - T *x, *y; - x = new T; - delete x; - y = new T[2]; - delete[] y; -} - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 -template -void construct_at_boundary(int boundary) -{ - char buf[sizeof(T)+256]; - size_t _buf = reinterpret_cast(buf); - _buf += (EIGEN_MAX_ALIGN_BYTES - (_buf % EIGEN_MAX_ALIGN_BYTES)); // make 16/32/...-byte aligned - _buf += boundary; // make exact boundary-aligned - T *x = ::new(reinterpret_cast(_buf)) T; - x[0].setZero(); // just in order to silence warnings - x->~T(); -} -#endif - -void unalignedassert() -{ -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - construct_at_boundary(4); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(16); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - 
construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - - construct_at_boundary(16); - construct_at_boundary(4); - construct_at_boundary(EIGEN_MAX_ALIGN_BYTES); - construct_at_boundary(16); -#endif - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good(); - check_unalignedassert_good >(); - -#if EIGEN_MAX_STATIC_ALIGN_BYTES>0 - if(EIGEN_MAX_ALIGN_BYTES>=16) - { - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - // Complexes are disabled because the compiler might aggressively vectorize - // the initialization of complex coeffs to 0 before we can check for alignedness - //VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - VERIFY_RAISES_ASSERT(construct_at_boundary(8)); - } - for(int b=8; b(b)); - if(b<64) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - if(b<128) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - //if(b<32) VERIFY_RAISES_ASSERT(construct_at_boundary(b)); - } -#endif -} - -EIGEN_DECLARE_TEST(unalignedassert) -{ - CALL_SUBTEST(unalignedassert()); -} diff --git a/test/vectorization_logic.cpp b/test/vectorization_logic.cpp index 7a853888b4117d9663b96093ae47a67f893b4f6b..19375191d5de4a6e172e30d6e5a891965fc48121 100644 --- a/test/vectorization_logic.cpp +++ b/test/vectorization_logic.cpp @@ -39,11 +39,15 @@ bool test_assign(const Dst&, const Src&, int traversal, int unrolling) { EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src); typedef internal::copy_using_evaluator_traits,internal::evaluator, internal::assign_op > traits; - bool res = traits::Traversal==traversal; - if(unrolling==InnerUnrolling+CompleteUnrolling) - res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); - else - res = res && int(traits::Unrolling)==unrolling; + // If traversal or unrolling are negative, ignore. + bool res = traversal > -1 ? traits::Traversal==traversal : true; + if (unrolling > -1) { + if(unrolling==InnerUnrolling+CompleteUnrolling) { + res = res && (int(traits::Unrolling)==InnerUnrolling || int(traits::Unrolling)==CompleteUnrolling); + } else { + res = res && int(traits::Unrolling)==unrolling; + } + } if(!res) { std::cerr << "Src: " << demangle_flags(Src::Flags) << std::endl; @@ -159,11 +163,11 @@ struct vectorization_logic EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1(),Matrix1()+Matrix1(), - (Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, + (int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) + EIGEN_UNALIGNED_VECTORIZE ? 
((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal, CompleteUnrolling)); VERIFY(test_assign(Matrix44c().col(1),Matrix44c().col(2)+Matrix44c().col(3), @@ -178,21 +182,15 @@ struct vectorization_logic typedef Matrix Vector3; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector3(),Vector3()+Vector3(), - sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), CompleteUnrolling)); - VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) - : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); + // Vectorization depends on too many factors - ignore. + VERIFY(test_assign(Vector3(),Vector3()+Vector3(), -1, CompleteUnrolling)); VERIFY(test_assign(Matrix3(),Matrix3().cwiseProduct(Matrix3()), LinearVectorizedTraversal,CompleteUnrolling)); + // Vectorization depends on too many factors - ignore. VERIFY(test_assign(Matrix(),Matrix()+Matrix(), - sizeof(Scalar)==16 ? InnerVectorizedTraversal : - EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : - LinearTraversal, - NoUnrolling)); + -1, NoUnrolling)); VERIFY(test_assign(Matrix11(), Matrix11()+Matrix11(),InnerVectorizedTraversal,CompleteUnrolling)); @@ -277,12 +275,20 @@ struct vectorization_logic_half }; static void run() { + // Some half-packets have a byte size < EIGEN_MIN_ALIGN_BYTES (e.g. Packet2f), + // which causes many of these tests to fail since they don't vectorize if + // EIGEN_UNALIGNED_VECTORIZE is 0 (the matrix is assumed unaligned). + // Adjust the matrix sizes to account for these alignment issues. + enum { PacketBytes = sizeof(Scalar)*PacketSize }; + enum { MinVSize = EIGEN_UNALIGNED_VECTORIZE ? int(PacketSize) + : int(PacketBytes) >= EIGEN_MIN_ALIGN_BYTES ? int(PacketSize) + : (EIGEN_MIN_ALIGN_BYTES + sizeof(Scalar) - 1) / sizeof(Scalar) }; - typedef Matrix Vector1; - typedef Matrix Matrix11; - typedef Matrix Matrix57; - typedef Matrix Matrix35; - typedef Matrix Matrix57u; + typedef Matrix Vector1; + typedef Matrix Matrix11; + typedef Matrix Matrix57; + typedef Matrix Matrix35; + typedef Matrix Matrix57u; typedef Matrix Matrix3; - #if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT +#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT VERIFY(test_assign(Vector1(),Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1()+Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector1(),Vector1().template segment(0).derived(), + VERIFY(test_assign(Vector1(),Vector1().template segment(0).derived(), EIGEN_UNALIGNED_VECTORIZE ? InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Scalar(2.1)*Vector1()-Vector1(), InnerVectorizedTraversal,CompleteUnrolling)); - VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment(0)-Vector1().template segment(0)).derived(), + VERIFY(test_assign(Vector1(),(Scalar(2.1)*Vector1().template segment(0)-Vector1().template segment(0)).derived(), EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : LinearVectorizedTraversal,CompleteUnrolling)); VERIFY(test_assign(Vector1(),Vector1().cwiseProduct(Vector1()), InnerVectorizedTraversal,CompleteUnrolling)); @@ -324,26 +330,23 @@ struct vectorization_logic_half EIGEN_UNALIGNED_VECTORIZE ? InnerUnrolling : NoUnrolling)); VERIFY(test_assign(Matrix1u(),Matrix1()+Matrix1(), - EIGEN_UNALIGNED_VECTORIZE ? ((Matrix1::InnerSizeAtCompileTime % PacketSize)==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); + EIGEN_UNALIGNED_VECTORIZE ? ((int(Matrix1::InnerSizeAtCompileTime) % int(PacketSize))==0 ? InnerVectorizedTraversal : LinearVectorizedTraversal) : LinearTraversal,CompleteUnrolling)); if(PacketSize>1) { typedef Matrix Matrix33c; VERIFY(test_assign(Matrix33c().row(2),Matrix33c().row(1)+Matrix33c().row(1), LinearTraversal,CompleteUnrolling)); - VERIFY(test_assign(Matrix33c().col(0),Matrix33c().col(1)+Matrix33c().col(1), - EIGEN_UNALIGNED_VECTORIZE ? (sizeof(Scalar)==16 ? InnerVectorizedTraversal : LinearVectorizedTraversal) - : (sizeof(Scalar)==16 ? SliceVectorizedTraversal : LinearTraversal), - ((!EIGEN_UNALIGNED_VECTORIZE) && (sizeof(Scalar)==16)) ? NoUnrolling : CompleteUnrolling)); - + + // Unrolling depends on read costs and unroll limits, which vary - ignore. VERIFY(test_assign(Matrix3(),Matrix3().cwiseQuotient(Matrix3()), - PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal,CompleteUnrolling)); + PacketTraits::HasDiv ? LinearVectorizedTraversal : LinearTraversal, -1)); VERIFY(test_assign(Matrix(),Matrix()+Matrix(), sizeof(Scalar)==16 ? InnerVectorizedTraversal : (EIGEN_UNALIGNED_VECTORIZE ? LinearVectorizedTraversal : LinearTraversal), NoUnrolling)); - VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), + VERIFY(test_assign(Matrix11(),Matrix().template block(2,3)+Matrix().template block(8,4), EIGEN_UNALIGNED_VECTORIZE ? 
InnerVectorizedTraversal : DefaultTraversal,InnerUnrolling+CompleteUnrolling)));
@@ -357,7 +360,7 @@ struct vectorization_logic_half
     VERIFY(test_redux(Vector1(),
       LinearVectorizedTraversal,CompleteUnrolling));
 
-    VERIFY(test_redux(Matrix<Scalar,PacketSize,3>(),
+    VERIFY(test_redux(Matrix<Scalar,MinVSize,3>(),
       LinearVectorizedTraversal,CompleteUnrolling));
 
     VERIFY(test_redux(Matrix3(),
@@ -379,9 +382,9 @@ struct vectorization_logic_half
          Matrix<Scalar,4,4> >(DefaultTraversal,PacketSize>4?InnerUnrolling:CompleteUnrolling)));
 
-    VERIFY((test_assign(Matrix57(), Matrix<Scalar,5,3>()*Matrix<Scalar,3,7>(),
-      InnerVectorizedTraversal, InnerUnrolling+CompleteUnrolling)));
-    #endif
+    VERIFY((test_assign(Matrix57(), Matrix<Scalar,5,3>() * Matrix<Scalar,3,7>(),
+      InnerVectorizedTraversal, InnerUnrolling + CompleteUnrolling)));
+#endif
   }
 };
diff --git a/unsupported/CMakeLists.txt b/unsupported/CMakeLists.txt
index 34408c017004409072f71b860477790cb767dadf..67d1f6262a494fca6d222498e88430d20df4becc 100644
--- a/unsupported/CMakeLists.txt
+++ b/unsupported/CMakeLists.txt
@@ -2,7 +2,7 @@ add_subdirectory(Eigen)
 if(EIGEN_BUILD_DOC)
   add_subdirectory(doc EXCLUDE_FROM_ALL)
 endif()
-if(BUILD_TESTING)
+if(EIGEN_BUILD_TESTING)
   if(EIGEN_LEAVE_TEST_IN_ALL_TARGET)
     add_subdirectory(test) # can't do EXCLUDE_FROM_ALL here, breaks CTest
   else()
diff --git a/unsupported/Eigen/AutoDiff b/unsupported/Eigen/AutoDiff
index 7a4ff460ce505d1801af0ef383525b1beb5bb628..e10875e71cf212b31156cbd3fb9b1facc6656c0f 100644
--- a/unsupported/Eigen/AutoDiff
+++ b/unsupported/Eigen/AutoDiff
@@ -10,6 +10,8 @@
 #ifndef EIGEN_AUTODIFF_MODULE
 #define EIGEN_AUTODIFF_MODULE
 
+#include "../../Eigen/Core"
+
 namespace Eigen {
 
 /**
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index d73c6008d92f2a91e603b8d1e92f8cf2ec7116ea..0938bb554da43256277e42971b355a242f96cf4f 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -41,14 +41,6 @@
 #include <random>
 #include <thread>
 
-#ifdef _WIN32
-#include <windows.h>
-#elif defined(__APPLE__)
-#include <mach/mach_time.h>
-#else
-#include <time.h>
-#endif
-
 #if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
 #include "ThreadPool"
 #endif
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index 9b6f14204e06064174aaf3f3520796ed7c9f287e..2f65b1b0e9f60a4c3ac6baabdeb68d6d15fd1ca6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -3,8 +3,6 @@
 Tensors are multidimensional arrays of elements. Elements are typically
 scalars, but more complex types such as strings are also supported.
 
-[TOC]
-
 ## Tensor Classes
 
 You can manipulate a tensor with one of the following classes.  They all are in
@@ -21,7 +19,7 @@ matrix.
 Tensors of this class are resizable.  For example, if you assign a tensor of a
 different size to a Tensor, that tensor is resized to match its new value.
 
-#### Constructor `Tensor<data_type, rank>(size0, size1, ...)`
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
 
 Constructor for a Tensor.  The constructor must be passed `rank` integers
 indicating the sizes of the instance along each of the `rank`
 dimensions.
 
     // Resize t_3d by assigning a tensor of different sizes, but same rank.
     t_3d = Tensor<float, 3>(3, 4, 3);
 
-#### Constructor `Tensor<data_type, rank>(size_array)`
+#### Constructor Tensor<data_type, rank>(size_array)
 
 Constructor where the sizes for the constructor are specified as an array of
 values instead of an explicit list of parameters.  The array type to use is
@@ -45,7 +43,7 @@ from an initializer list.
    Tensor<string, 2> t_2d({5, 7});
 
-### Class `TensorFixedSize<data_type, Sizes<size0, size1, ...>>`
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
 
 Class to use for tensors of fixed size, where the size is known at compile
 time.  Fixed sized tensors can provide very fast computations because all their
@@ -57,7 +55,7 @@ tensor data is held onto the stack and does not cause heap allocation and free.
     // Create a 4 x 3 tensor of floats.
     TensorFixedSize<float, Sizes<4, 3>> t_4x3;
 
-### Class `TensorMap<Tensor<data_type, rank>>`
+### Class TensorMap<Tensor<data_type, rank>>
 
 This is the class to use to create a tensor on top of memory allocated and
 owned by another part of your code.  It allows you to view any piece of allocated
@@ -67,7 +65,7 @@ data are stored.
 A TensorMap is not resizable because it does not own the memory where its data
 are stored.
 
-#### Constructor `TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)`
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
 
 Constructor for a Tensor.  The constructor must be passed a pointer to the
 storage for the data, and "rank" size attributes.  The storage has to be
@@ -87,13 +85,13 @@ large enough to hold all the data.
     TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
 
-#### Class `TensorRef`
+#### Class TensorRef
 
 See Assigning to a TensorRef below.
 
 ## Accessing Tensor Elements
 
-#### `<data_type> tensor(index0, index1...)`
+#### <data_type> tensor(index0, index1...)
 
 Return the element at position `(index0, index1...)` in tensor
 `tensor`.  You must pass as many parameters as the rank of `tensor`.
@@ -278,7 +276,7 @@ Similarly, assigning an expression to a TensorMap causes its evaluation.  Like
 tensors of type TensorFixedSize, TensorMaps cannot be resized so they have to
 have the rank and sizes of the expression that are assigned to them.
 
-#### Calling `eval()`.
+#### Calling eval().
 
 When you compute large composite expressions, you sometimes want to tell Eigen
 that an intermediate value in the expression tree is worth evaluating ahead of
@@ -355,7 +353,7 @@ call for the right hand side:
     (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
 
-#### Assigning to a `TensorRef`.
+#### Assigning to a TensorRef.
 
 If you need to access only a few elements from the value of an expression you
 can avoid materializing the value in a full tensor by using a TensorRef.
@@ -455,24 +453,24 @@ memory for tensors with cuda.
 In the documentation of the tensor methods and Operation we mention datatypes
 that are tensor-type specific:
 
-#### `<Tensor-Type>::``Dimensions`
+#### <Tensor-Type>::Dimensions
 
 Acts like an array of ints.  Has an `int size` attribute, and can be
 indexed like an array to access individual values.  Used to represent the
 dimensions of a tensor.  See `dimensions()`.
 
-#### `<Tensor-Type>::``Index`
+#### <Tensor-Type>::Index
 
 Acts like an `int`.  Used for indexing tensors along their dimensions.  See
 `operator()`, `dimension()`, and `size()`.
 
-#### `<Tensor-Type>::``Scalar`
+#### <Tensor-Type>::Scalar
 
 Represents the datatype of individual tensor elements.  For example, for a
 `Tensor<float, 3>`, `Scalar` is the type `float`.  See `setConstant()`.
 
-#### `<Operation>`
+#### <Operation>
 
 We use this pseudo type to indicate that a tensor Operation is returned by a
 method.  We indicate in the text the type and dimensions of the tensor that the
@@ -492,7 +490,7 @@ Tensor, TensorFixedSize, and TensorMap.
 
 ## Metadata
 
-### `int NumDimensions`
+### int NumDimensions
 
 Constant value indicating the number of dimensions of a Tensor.  This is also
 known as the tensor "rank".
 
     cout << "Dims " << a.NumDimensions;
     => Dims 2
 
-### `Dimensions dimensions()`
+### Dimensions dimensions()
 
 Returns an array-like object representing the dimensions of the tensor.
 The actual type of the `dimensions()` result is `<Tensor-Type>::Dimensions`.
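A short illustration of these metadata accessors, under the assumption of a rank-3 float tensor (sizes chosen for the example):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 3> t(2, 3, 4);
  // NumDimensions is the static rank; dimensions() gives per-axis sizes.
  std::cout << "rank: " << t.NumDimensions << "\n";             // 3
  const auto& d = t.dimensions();
  std::cout << "dims: " << d[0] << "x" << d[1] << "x" << d[2]   // 2x3x4
            << ", size: " << t.size() << "\n";                  // 24
  return 0;
}
```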
@@ -519,7 +517,7 @@ If you use a C++11 compiler, you can use `auto` to simplify the code:
          << ", dim 1: " << d[1];
     => Dim size: 2, dim 0: 3, dim 1: 4
 
-### `Index dimension(Index n)`
+### Index dimension(Index n)
 
 Returns the n-th dimension of the tensor.  The actual type of the
 `dimension()` result is `<Tensor-Type>::Index`, but you can
@@ -530,7 +528,7 @@ always use it like an int.
     cout << "Dim 1: " << dim1;
     => Dim 1: 4
 
-### `Index size()`
+### Index size()
 
 Returns the total number of elements in the tensor.  This is the product of all
 the tensor dimensions.  The actual type of the `size()` result is
@@ -605,7 +603,7 @@ You can use one of the methods below to initialize the tensor memory.  These
 have an immediate effect on the tensor and return the tensor itself as a
 result.  These are not tensor Operations which delay evaluation.
 
-### `<Tensor-Type> setConstant(const Scalar& val)`
+### <Tensor-Type> setConstant(const Scalar& val)
 
 Sets all elements of the tensor to the constant value `val`.  `Scalar` is the
 type of data stored in the tensor.  You can pass any value that is
@@ -633,7 +631,7 @@ has a copy constructor and an `operator=()`:
     yolo yolo yolo
     yolo yolo yolo
 
-### `<Tensor-Type> setZero()`
+### <Tensor-Type> setZero()
 
 Fills the tensor with zeros.  Equivalent to `setConstant(Scalar(0))`.
 Returns the tensor itself in case you want to chain another call.
@@ -647,7 +645,7 @@ Returns the tensor itself in case you want to chain another call.
     0 0 0 0
 
-### `<Tensor-Type> setValues({..initializer_list})`
+### <Tensor-Type> setValues({..initializer_list})
 
 Fills the tensor with explicit values specified in a std::initializer_list.
 The type of the initializer list depends on the type and rank of the tensor.
@@ -683,7 +681,7 @@ code only sets the values of the first row of the tensor.
     10 20 30
     1000 1000 1000
 
-### `<Tensor-Type> setRandom()`
+### <Tensor-Type> setRandom()
 
 Fills the tensor with random values.  Returns the tensor itself in case you
 want to chain another call.
@@ -750,7 +748,7 @@ values of a tensor expression, the expression must either be evaluated or
 wrapped in a TensorRef.
 
-### `Scalar* data()` and `const Scalar* data() const`
+### Scalar* data() and const Scalar* data() const
 
 Returns a pointer to the storage for the tensor.  The pointer is const if the
 tensor was const.  This allows direct access to the data.  The layout of the
@@ -778,7 +776,7 @@ The chain of Operation is evaluated lazily, typically when it is assigned to a
 tensor.  See "Controlling when Expression are Evaluated" for more details about
 their evaluation.
 
-### `<Operation> constant(const Scalar& val)`
+### <Operation> constant(const Scalar& val)
 
 Returns a tensor of the same type and dimensions as the original tensor but
 where all elements have the value `val`.
@@ -806,7 +804,7 @@ tensor, or multiply every element of a tensor by a scalar.
     0.6 0.6 0.6
     0.6 0.6 0.6
 
-### `<Operation> random()`
+### <Operation> random()
 
 Returns a tensor of the same type and dimensions as the current tensor
 but where all elements have random values.
@@ -836,7 +834,7 @@ All these operations take a single input tensor as argument and return a tensor
 of the same type and dimensions as the tensor to which they are applied.  The
 requested operations are applied to each element independently.
 
-### `<Operation> operator-()`
+### <Operation> operator-()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the opposite values of the original tensor.
@@ -855,42 +853,42 @@ containing the opposite values of the original tensor.
     -1 -1 -1
     -1 -1 -1
 
-### `<Operation> sqrt()`
+### <Operation> sqrt()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the square roots of the original tensor.
-### `<Operation> rsqrt()`
+### <Operation> rsqrt()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the inverse square roots of the original tensor.
 
-### `<Operation> square()`
+### <Operation> square()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the squares of the original tensor values.
 
-### `<Operation> inverse()`
+### <Operation> inverse()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the inverse of the original tensor values.
 
-### `<Operation> exp()`
+### <Operation> exp()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the exponential of the original tensor.
 
-### `<Operation> log()`
+### <Operation> log()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the natural logarithms of the original tensor.
 
-### `<Operation> abs()`
+### <Operation> abs()
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the absolute values of the original tensor.
 
-### `<Operation> pow(Scalar exponent)`
+### <Operation> pow(Scalar exponent)
 
 Returns a tensor of the same type and dimensions as the original tensor
 containing the coefficients of the original tensor to the power of the
@@ -917,17 +915,17 @@ cubic roots of an int Tensor:
     0 1 2
     3 4 5
 
-### `<Operation> operator * (Scalar scale)`
+### <Operation> operator * (Scalar scale)
 
 Multiplies all the coefficients of the input tensor by the provided scale.
 
-### `<Operation> cwiseMax(Scalar threshold)`
+### <Operation> cwiseMax(Scalar threshold)
 TODO
 
-### `<Operation> cwiseMin(Scalar threshold)`
+### <Operation> cwiseMin(Scalar threshold)
 TODO
 
-### `<Operation> unaryExpr(const CustomUnaryOp& func)`
+### <Operation> unaryExpr(const CustomUnaryOp& func)
 TODO
 
@@ -939,39 +937,39 @@ dimensions as the tensors to which they are applied, and unless otherwise
 specified it is also of the same type. The requested operations are applied to
 each pair of elements independently.
 
-### `<Operation> operator+(const OtherDerived& other)`
+### <Operation> operator+(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise sums of the inputs.
 
-### `<Operation> operator-(const OtherDerived& other)`
+### <Operation> operator-(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise differences of the inputs.
 
-### `<Operation> operator*(const OtherDerived& other)`
+### <Operation> operator*(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise products of the inputs.
 
-### `<Operation> operator/(const OtherDerived& other)`
+### <Operation> operator/(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise quotients of the inputs.
 
 This operator is not supported for integer types.
 
-### `<Operation> cwiseMax(const OtherDerived& other)`
+### <Operation> cwiseMax(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise maximums of the inputs.
 
-### `<Operation> cwiseMin(const OtherDerived& other)`
+### <Operation> cwiseMin(const OtherDerived& other)
 
 Returns a tensor of the same type and dimensions as the input tensors
 containing the coefficient wise minimums of the inputs.
 
-### `<Operation> Logical operators`
+### <Operation> Logical operators
 
 The following logical operators are supported as well:
 
@@ -1129,50 +1127,50 @@ scalar, represented as a zero-dimension tensor.
     276
 
-### `<Operation> sum(const Dimensions& new_dims)`
-### `<Operation> sum()`
+### <Operation> sum(const Dimensions& new_dims)
+### <Operation> sum()
 
 Reduce a tensor using the sum() operator.  The resulting values
 are the sum of the reduced values.
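A brief sketch of reducing along specific dimensions (the dimension indices and sizes are illustrative):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<int, 3> t(2, 3, 4);
  t.setConstant(1);
  // Reduce over dimensions 1 and 2: the result is a rank-1 tensor of
  // size 2, each entry the sum of 3*4 = 12 ones.
  Eigen::array<int, 2> dims{{1, 2}};
  Eigen::Tensor<int, 1> s = t.sum(dims);
  std::cout << s(0) << " " << s(1) << "\n";  // 12 12
  return 0;
}
```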
-### `<Operation> mean(const Dimensions& new_dims)`
-### `<Operation> mean()`
+### <Operation> mean(const Dimensions& new_dims)
+### <Operation> mean()
 
 Reduce a tensor using the mean() operator.  The resulting values
 are the mean of the reduced values.
 
-### `<Operation> maximum(const Dimensions& new_dims)`
-### `<Operation> maximum()`
+### <Operation> maximum(const Dimensions& new_dims)
+### <Operation> maximum()
 
 Reduce a tensor using the maximum() operator.  The resulting values are the
 largest of the reduced values.
 
-### `<Operation> minimum(const Dimensions& new_dims)`
-### `<Operation> minimum()`
+### <Operation> minimum(const Dimensions& new_dims)
+### <Operation> minimum()
 
 Reduce a tensor using the minimum() operator.  The resulting values
 are the smallest of the reduced values.
 
-### `<Operation> prod(const Dimensions& new_dims)`
-### `<Operation> prod()`
+### <Operation> prod(const Dimensions& new_dims)
+### <Operation> prod()
 
 Reduce a tensor using the prod() operator.  The resulting values
 are the product of the reduced values.
 
-### `<Operation> all(const Dimensions& new_dims)`
-### `<Operation> all()`
+### <Operation> all(const Dimensions& new_dims)
+### <Operation> all()
 
 Reduce a tensor using the all() operator.  Casts tensor to bool and then checks
 whether all elements are true.  Runs through all elements rather than
 short-circuiting, so may be significantly inefficient.
 
-### `<Operation> any(const Dimensions& new_dims)`
-### `<Operation> any()`
+### <Operation> any(const Dimensions& new_dims)
+### <Operation> any()
 
 Reduce a tensor using the any() operator.  Casts tensor to bool and then checks
 whether any element is true.  Runs through all elements rather than
 short-circuiting, so may be significantly inefficient.
 
-### `<Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)`
+### <Operation> reduce(const Dimensions& new_dims, const Reducer& reducer)
 
 Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
 in TensorFunctors.h for information on how to implement a reduction operator.
@@ -1208,8 +1206,8 @@ Example: Trace along 2 dimensions.
     15
 
-### `<Operation> trace(const Dimensions& new_dims)`
-### `<Operation> trace()`
+### <Operation> trace(const Dimensions& new_dims)
+### <Operation> trace()
 
 As a special case, if no parameter is passed to the operation, trace is computed
 along *all* dimensions of the input tensor.
@@ -1259,18 +1257,18 @@
     1 3 6
     4 9 15
 
-### `<Operation> cumsum(const Index& axis)`
+### <Operation> cumsum(const Index& axis)
 
 Perform a scan by summing consecutive entries.
 
-### `<Operation> cumprod(const Index& axis)`
+### <Operation> cumprod(const Index& axis)
 
 Perform a scan by multiplying consecutive entries.
 
 ## Convolutions
 
-### `<Operation> convolve(const Kernel& kernel, const Dimensions& dims)`
+### <Operation> convolve(const Kernel& kernel, const Dimensions& dims)
 
 Returns a tensor that is the output of the convolution of the input tensor with the kernel,
 along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
@@ -1313,7 +1311,7 @@ These operations return a Tensor with different dimensions than the original
 Tensor. They can be used to access slices of tensors, see them with different
 dimensions, or pad tensors with additional data.
 
-### `<Operation> reshape(const Dimensions& new_dims)`
+### <Operation> reshape(const Dimensions& new_dims)
 
 Returns a view of the input tensor that has been reshaped to the specified
 new dimensions.  The argument new_dims is an array of Index values.  The
@@ -1392,7 +1390,7 @@ Note that "b" itself was not reshaped but that instead the assignment is done to
 the reshape view of b.
 
-### `<Operation> shuffle(const Shuffle& shuffle)`
+### <Operation> shuffle(const Shuffle& shuffle)
 
 Returns a copy of the input tensor whose dimensions have been
 reordered according to the specified permutation.
The argument shuffle @@ -1433,7 +1431,7 @@ Let's rewrite the previous example to take advantage of this feature: output.shuffle({2, 0, 1}) = input; -### ` stride(const Strides& strides)` +### stride(const Strides& strides) Returns a view of the input tensor that strides (skips stride-1 elements) along each of the dimensions. The argument strides is an @@ -1459,7 +1457,7 @@ It is possible to assign a tensor to a stride: output.stride({2, 3, 4}) = input; -### ` slice(const StartIndices& offsets, const Sizes& extents)` +### slice(const StartIndices& offsets, const Sizes& extents) Returns a sub-tensor of the given tensor. For each dimension i, the slice is made of the coefficients stored between offset[i] and offset[i] + extents[i] in @@ -1485,7 +1483,7 @@ the input tensor. 600 700 -### ` chip(const Index offset, const Index dim)` +### chip(const Index offset, const Index dim) A chip is a special kind of slice. It is the subtensor at the given offset in the dimension dim. The returned tensor has one fewer dimension than the input @@ -1536,7 +1534,7 @@ lvalue. For example: 0 0 0 -### ` reverse(const ReverseDimensions& reverse)` +### reverse(const ReverseDimensions& reverse) Returns a view of the input tensor that reverses the order of the coefficients along a subset of the dimensions. The argument reverse is an array of boolean @@ -1566,7 +1564,7 @@ of a 2D tensor: 0 100 200 -### ` broadcast(const Broadcast& broadcast)` +### broadcast(const Broadcast& broadcast) Returns a view of the input tensor in which the input is replicated one to many times. @@ -1590,11 +1588,11 @@ made in each of the dimensions. 0 100 200 0 100 200 300 400 500 300 400 500 -### ` concatenate(const OtherDerived& other, Axis axis)` +### concatenate(const OtherDerived& other, Axis axis) TODO -### ` pad(const PaddingDimensions& padding)` +### pad(const PaddingDimensions& padding) Returns a view of the input tensor in which the input is padded with zeros. @@ -1619,7 +1617,7 @@ Returns a view of the input tensor in which the input is padded with zeros. 0 0 0 0 -### ` extract_patches(const PatchDims& patch_dims)` +### extract_patches(const PatchDims& patch_dims) Returns a tensor of coefficient patches extracted from the input tensor, where each patch is of dimension specified by 'patch_dims'. The returned tensor has @@ -1706,7 +1704,7 @@ This code results in the following output when the data layout is RowMajor: 6 7 10 11 -### ` extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)` +### extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type) Returns a tensor of coefficient image patches extracted from the input tensor, which is expected to have dimensions ordered as follows (depending on the data @@ -1763,7 +1761,7 @@ sizes: ## Special Operations -### ` cast()` +### cast() Returns a tensor of type T with the same dimensions as the original tensor. 
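The morphing operations above compose like any other tensor expression; a minimal sketch combining slice and chip (shapes assumed for illustration):

    Eigen::Tensor<float, 3> t(4, 5, 6);
    t.setRandom();
    // Keep positions 1..2 of the first dimension, then chip away the
    // third dimension at offset 0, leaving a 2x5 tensor.
    Eigen::array<int, 3> offsets = {1, 0, 0};
    Eigen::array<int, 3> extents = {2, 5, 6};
    Eigen::Tensor<float, 2> s = t.slice(offsets, extents).chip(0, 2);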
The returned tensor contains the values of the original tensor converted to @@ -1792,7 +1790,7 @@ but you can easily cast the tensors to floats to do the division: 1 2 2 -### ` eval()` +### eval() TODO diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index 91a6f8d6c32ef763f7232ab681f501e3a0c9a7c3..8b8fb923523d6c42df61f5b9ce7e6fc877fd4094 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -99,18 +99,18 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -240,7 +240,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_orig_impl(op.expression(), device), m_impl(op.expression().index_tuples().reduce(op.reduce_dims(), op.reduce_op()), device), m_return_dim(op.return_dim()) @@ -263,11 +263,11 @@ struct TensorEvaluator, Devi return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h index 72f072cf2e4df90f49920908c845c11f665e5d99..e5811d63fa63b7fd98ef94bbcfd4f0335040e87e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h @@ -104,14 +104,14 @@ struct TensorEvaluator, Device> static const int NumDims = XprType::NumDims; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), Layout = TensorEvaluator::Layout, RawAccess = TensorEvaluator::RawAccess }; @@ -124,7 +124,7 @@ 
struct TensorEvaluator, Device> RightTensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) { @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> return m_rightImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions())); m_leftImpl.evalSubExprsIfNeeded(NULL); // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non @@ -154,7 +154,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync( @@ -163,7 +163,7 @@ struct TensorEvaluator, Device> } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 35b6458e5a813518d0ba0bcc7d3949d931914b2d..fcc7411afcaedacecc26f875bbd55db3d87bc9f5 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -513,34 +513,34 @@ class TensorBase // Comparisons and tests. 
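// The comparison operators in the hunk below now take their argument as a
// TensorBase expression rather than an unconstrained OtherDerived, so they
// participate in overload resolution only when the right-hand side really
// is a tensor expression. A hedged usage sketch (types and values assumed
// for illustration):
//
//   Eigen::Tensor<float, 2> a(2, 2), b(2, 2);
//   a.setRandom(); b.setRandom();
//   Eigen::Tensor<bool, 2> mask = a < b;  // coefficient-wise comparison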
template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<(const OtherDerived& other) const { + operator<(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator<=(const OtherDerived& other) const { + operator<=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>(const OtherDerived& other) const { + operator>(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator>=(const OtherDerived& other) const { + operator>=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator==(const OtherDerived& other) const { + operator==(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp, const Derived, const OtherDerived> - operator!=(const OtherDerived& other) const { + operator!=(const TensorBase& other) const { return binaryExpr(other.derived(), internal::scalar_cmp_op()); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h index 1e55d12c42fc2ece4035b743915eef10d996ea0a..243b3fb7bd68bc1896ef92da191b24b47460f6fa 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h @@ -242,7 +242,7 @@ class TensorBlockDescriptor { const DestinationBufferKind& kind() const { return m_kind; } private: - friend class TensorBlockDescriptor; + friend class TensorBlockDescriptor; DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} @@ -706,7 +706,7 @@ class TensorMaterializedBlock { } private: - friend class TensorMaterializedBlock; + friend class TensorMaterializedBlock; Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output, diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index fc75c8d9a6fcd1d5835b8e1668b021d97387460a..7449b046b2936bab7891a9299759fe800016eeb1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -127,7 +127,7 @@ struct TensorEvaluator, Device> typedef DSizes BroadcastDimensions; //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===// - typedef internal::TensorBlockDescriptor TensorBlockDesc; + typedef internal::TensorBlockDescriptor TensorBlockDesc; typedef internal::TensorBlockScratchAllocator TensorBlockScratch; typedef typename TensorEvaluator::TensorBlock @@ -138,14 +138,13 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : isCopy(false), nByOne(false), 
oneByN(false), m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device) { // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar - // and store the result in a scalar. Instead one should reshape the scalar into a a N-D + // and store the result in a scalar. Instead one should reshape the scalar into a N-D // tensor with N >= 1 of 1 element first and then broadcast. EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); @@ -211,20 +210,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -411,25 +410,24 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Concatenates m_broadcast[dim] copies, + // [v0, ..., vN, v0, ..., vN, ... ] + // with dim == NumDims - 1 for col-major, dim == 0 for row-major. EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - Index dim, inputIndex; - - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = NumDims - 1; - } else { - dim = 0; - } - - inputIndex = index % m_inputStrides[dim]; - if (inputIndex + PacketSize <= m_inputStrides[dim]) { + // Size of flattened tensor. + const Index M = (static_cast(Layout) == static_cast(ColMajor)) ? + m_inputStrides[NumDims - 1] : m_inputStrides[0]; + Index inputIndex = index % M; + if (inputIndex + PacketSize <= M) { return m_impl.template packet(inputIndex); } else { EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; EIGEN_UNROLL_LOOP for (int i = 0; i < PacketSize; ++i) { - if (inputIndex > m_inputStrides[dim]-1) { + if (inputIndex > M - 1) { inputIndex = 0; } values[i] = m_impl.coeff(inputIndex++); @@ -441,32 +439,30 @@ struct TensorEvaluator, Device> template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const { + // Consider the flattened tensor [v0, ..., vN], + // Interleaves m_broadcast[dim] copies, + // [v0, v0, ..., v1, v1, ..., vN, vN, ... ] + // with dim == 0 for col-major, dim == NumDims - 1 for row-major. EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); - - EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; - Index dim, inputIndex, outputOffset; + eigen_assert(index + PacketSize-1 < dimensions().TotalSize()); - if (static_cast(Layout) == static_cast(ColMajor)) { - dim = 1; - } else { - dim = NumDims - 2; - } + const Index M = (static_cast(Layout) == static_cast(ColMajor)) ? 
+ m_broadcast[0] : m_broadcast[NumDims - 1]; - inputIndex = index / m_outputStrides[dim]; - outputOffset = index % m_outputStrides[dim]; - if (outputOffset + PacketSize <= m_outputStrides[dim]) { - values[0] = m_impl.coeff(inputIndex); - return internal::pload1(values); + Index inputIndex = index / M; + Index outputOffset = index % M; + if (outputOffset + PacketSize <= M) { + return internal::pset1(m_impl.coeff(inputIndex)); } else { + EIGEN_ALIGN_MAX typename internal::remove_const::type values[PacketSize]; EIGEN_UNROLL_LOOP - for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) { - if (outputOffset + cur < m_outputStrides[dim]) { + for (int i = 0; i < PacketSize; ++i) { + if (outputOffset < M) { values[i] = m_impl.coeff(inputIndex); + ++outputOffset; } else { values[i] = m_impl.coeff(++inputIndex); - outputOffset = 0; - cur = 0; + outputOffset = 1; // Next offset. } } return internal::pload(values); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 7c6bbd180fba752e19db27b516be652d711d7ca2..376457341120f2224b652d796942c80a293ecfa9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -164,7 +164,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -200,12 +200,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -433,7 +433,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 0dfe21604290de306e0f618975cb5deb2cabcf7a..5235a8e6f93f9d0571ba1acbe7e7e28990b9e5d4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -119,7 +119,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -172,14 +172,14 @@ struct TensorEvaluator(Layout) == static_cast(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 424caced1a2a6eefe0bb6bca23304bdfb8518f4b..fa36da195f38fcc8621e55246734dc2bc8c6ddf6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -89,7 
+89,6 @@ struct TensorContractionBlockMemAllocator { eigen_assert(rhs_block); BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn); char* block_mem = static_cast(d.allocate(sz.lhs_size + sz.rhs_size)); - eigen_assert(block_mem); *lhs_block = reinterpret_cast(block_mem); *rhs_block = reinterpret_cast(block_mem + sz.lhs_size); return block_mem; @@ -417,7 +416,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator typedef DSizes Dimensions; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device) : m_leftImpl(choose(Cond(Layout) == static_cast(ColMajor)>(), op.lhsExpression(), op.rhsExpression()), device), @@ -602,7 +601,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_leftImpl.evalSubExprsIfNeeded(NULL); m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { @@ -617,7 +616,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) { @@ -633,6 +632,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } #endif // EIGEN_USE_THREADS +#ifndef TENSOR_CONTRACTION_DISPATCH #define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \ if (this->m_lhs_inner_dim_contiguous) { \ if (this->m_rhs_inner_dim_contiguous) { \ @@ -663,7 +663,9 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } \ } \ } +#endif +#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH #define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \ if (this->m_lhs_inner_dim_contiguous) { \ if (this->m_rhs_inner_dim_contiguous) { \ @@ -694,6 +696,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator } \ } \ } +#endif EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const { static_cast(this)->template evalProduct(buffer); @@ -908,7 +911,7 @@ struct TensorContractionEvaluatorBase : internal::no_assignment_operator kernel.deallocate(this->m_device, packed_mem); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_leftImpl.cleanup(); m_rightImpl.cleanup(); @@ -1005,7 +1008,7 @@ struct TensorEvaluator Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : + TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h index bb990b3782c33b2eea4870876ccda8529e8ff848..bf9194d88b76c80ec29771a3a57e2aed7f030245 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h @@ -233,7 +233,7 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, } \ } \ -#define writeRegToShmem(_) \ +#define writeRegToShmem() \ lhs_shmem[lhs_store_idx_0] = lhs_pf0; \ rhs_shmem[rhs_store_idx_0] = 
rhs_pf0; \ \ @@ -1270,7 +1270,7 @@ struct TensorEvaluator::value), @@ -1278,7 +1278,7 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (data) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h index a6ca1777a160579f431099b782348f2fcdb90259..473c228490f238f1a6c1bb6f0d31b6ae8f93b557 100755 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h @@ -1340,10 +1340,10 @@ struct TensorEvaluatorm_leftImpl.evalSubExprsIfNeeded(NULL); this->m_rightImpl.evalSubExprsIfNeeded(NULL); if (!data) { @@ -1630,7 +1630,7 @@ struct TensorEvaluatorm_leftImpl.cleanup(); this->m_rightImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index 44493906d0d81bd011a7663f77bf87047d3ce963..09d2da9a8d2ed240aa1c9bc9ba45bb54d2d7fabd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -195,14 +195,14 @@ class TensorConversionOp : public TensorBase struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) { impl.evalSubExprsIfNeeded(NULL); return true; } }; template struct ConversionSubExprEval { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { + static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -211,8 +211,7 @@ template struct ConversionSubExprEval< template struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType, EvalSubExprsCallback done) { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) { impl.evalSubExprsIfNeededAsync(nullptr, std::move(done)); } }; @@ -221,8 +220,7 @@ template struct ConversionSubExprEvalAsync { - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run( - Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { + static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) { impl.evalSubExprsIfNeededAsync(data, std::move(done)); } }; @@ -363,21 +361,21 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return ConversionSubExprEval, EvaluatorPointerType>::run(m_impl, data); } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType data, EvalSubExprsCallback done) { ConversionSubExprEvalAsync, EvaluatorPointerType, @@ -385,7 +383,7 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() + EIGEN_STRONG_INLINE void 
cleanup() { m_impl.cleanup(); } @@ -404,8 +402,8 @@ struct TensorEvaluator, Device> const bool Vectorizable = IsSameType ? TensorEvaluator::PacketAccess - : TensorEvaluator::PacketAccess & - internal::type_casting_traits::VectorizedCast; + : int(TensorEvaluator::PacketAccess) & + int(internal::type_casting_traits::VectorizedCast); return internal::PacketConv::run(m_impl, index); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index df289e2c0ad90e7e063a431065141708532e60f4..b20f80ba2a07166413e4e05d90c9995086dd34f1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -307,8 +307,8 @@ struct TensorEvaluator::IsAligned & TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & TensorEvaluator::PacketAccess, + IsAligned = int(TensorEvaluator::IsAligned) & int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & int(TensorEvaluator::PacketAccess), BlockAccess = false, PreferBlockAccess = false, Layout = TensorEvaluator::Layout, @@ -320,7 +320,7 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -384,12 +384,12 @@ struct TensorEvaluator(TensorEvaluator::Layout) == static_cast(TensorEvaluator::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h index 92003c76653130fcfbd015eceeb161570401cdc5..033318fdcc86e1e492f1c3277147f9de919e681e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h @@ -305,7 +305,7 @@ struct TensorEvaluator, Devi typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device) : m_op(op), m_device(device), m_result(NULL) { m_dimensions = op.func().dimensions(op.expression()); @@ -114,7 +114,7 @@ struct TensorEvaluator, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (data) { evalTo(data); return false; @@ -126,7 +126,7 @@ struct TensorEvaluator, Devi } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_result) { m_device.deallocate_temp(m_result); m_result = NULL; @@ -157,7 +157,7 @@ struct TensorEvaluator, Devi #endif protected: - EIGEN_DEVICE_FUNC void evalTo(EvaluatorPointerType data) { + void evalTo(EvaluatorPointerType data) { TensorMap > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.expression(), result, m_device); } @@ -279,7 +279,7 @@ struct TensorEvaluator > result(m_device.get(data), m_dimensions); m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h index 9422dcd7af7820b6012ca2a4bd65b92d2828d7ee..ec2e3cb143aaaea2b2ef996109cb8cdf25f902c1 100644 --- 
a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h @@ -42,51 +42,84 @@ class StreamInterface { virtual unsigned int* semaphore() const = 0; }; -static gpuDeviceProp_t* m_deviceProperties; -static bool m_devicePropInitialized = false; - -static void initializeDeviceProp() { - if (!m_devicePropInitialized) { - // Attempts to ensure proper behavior in the case of multiple threads - // calling this function simultaneously. This would be trivial to - // implement if we could use std::mutex, but unfortunately mutex don't - // compile with nvcc, so we resort to atomics and thread fences instead. - // Note that if the caller uses a compiler that doesn't support c++11 we - // can't ensure that the initialization is thread safe. - static std::atomic first(true); - if (first.exchange(false)) { - // We're the first thread to reach this point. - int num_devices; - gpuError_t status = gpuGetDeviceCount(&num_devices); - if (status != gpuSuccess) { - std::cerr << "Failed to get the number of GPU devices: " - << gpuGetErrorString(status) - << std::endl; - gpu_assert(status == gpuSuccess); - } - m_deviceProperties = new gpuDeviceProp_t[num_devices]; - for (int i = 0; i < num_devices; ++i) { - status = gpuGetDeviceProperties(&m_deviceProperties[i], i); +class GpuDeviceProperties { + public: + GpuDeviceProperties() : + initialized_(false), first_(true), device_properties_(nullptr) {} + + ~GpuDeviceProperties() { + if (device_properties_) { + delete[] device_properties_; + } + } + + EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { + return device_properties_[device]; + } + + EIGEN_STRONG_INLINE bool isInitialized() const { + return initialized_; + } + + void initialize() { + if (!initialized_) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. + if (first_.exchange(false)) { + // We're the first thread to reach this point. + int num_devices; + gpuError_t status = gpuGetDeviceCount(&num_devices); if (status != gpuSuccess) { - std::cerr << "Failed to initialize GPU device #" - << i - << ": " + std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl; gpu_assert(status == gpuSuccess); } - } + device_properties_ = new gpuDeviceProp_t[num_devices]; + for (int i = 0; i < num_devices; ++i) { + status = gpuGetDeviceProperties(&device_properties_[i], i); + if (status != gpuSuccess) { + std::cerr << "Failed to initialize GPU device #" + << i + << ": " + << gpuGetErrorString(status) + << std::endl; + gpu_assert(status == gpuSuccess); + } + } - std::atomic_thread_fence(std::memory_order_release); - m_devicePropInitialized = true; - } else { - // Wait for the other thread to inititialize the properties. - while (!m_devicePropInitialized) { - std::atomic_thread_fence(std::memory_order_acquire); - std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::atomic_thread_fence(std::memory_order_release); + initialized_ = true; + } else { + // Wait for the other thread to inititialize the properties. 
+ while (!initialized_) { + std::atomic_thread_fence(std::memory_order_acquire); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } } } } + + private: + volatile bool initialized_; + std::atomic first_; + gpuDeviceProp_t* device_properties_; +}; + +EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() { + static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties(); + if (!deviceProperties->isInitialized()) { + deviceProperties->initialize(); + } + return *deviceProperties; +} + +EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) { + return GetGpuDeviceProperties().get(device); } static const gpuStream_t default_stream = gpuStreamDefault; @@ -96,12 +129,9 @@ class GpuStreamDevice : public StreamInterface { // Use the default stream on the current device GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { gpuGetDevice(&device_); - initializeDeviceProp(); } // Use the default stream on the specified device - GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { - initializeDeviceProp(); - } + GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {} // Use the specified stream. Note that it's the // caller responsibility to ensure that the stream can run on // the specified device. If no device is specified the code @@ -118,7 +148,6 @@ class GpuStreamDevice : public StreamInterface { gpu_assert(device < num_devices); device_ = device; } - initializeDeviceProp(); } virtual ~GpuStreamDevice() { @@ -129,7 +158,7 @@ class GpuStreamDevice : public StreamInterface { const gpuStream_t& stream() const { return *stream_; } const gpuDeviceProp_t& deviceProperties() const { - return m_deviceProperties[device_]; + return GetGpuDeviceProperties(device_); } virtual void* allocate(size_t num_bytes) const { gpuError_t err = gpuSetDevice(device_); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index 4689b0230911f6e4bdabe56ecfa1d48435cd7ec4..a48d035f5aad48f0560e613b8d361f25d1075e30 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -131,17 +131,17 @@ struct TensorEvaluator, Device> TensorBlockAssignment; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()){} - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~TensorEvaluator() { + EIGEN_STRONG_INLINE ~TensorEvaluator() { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); return m_impl.evalSubExprsIfNeeded(m_buffer); @@ -149,7 +149,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType scalar, EvalSubExprsCallback done) { EIGEN_UNUSED_VARIABLE(scalar); eigen_assert(scalar == NULL); @@ -191,7 +191,7 @@ 
struct TensorEvaluator, Device> block.cleanup(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index d4532b72c82d6771bf2f3c034076582bb60196c0..6ac575ef59fd9adb39f4b4dec9d5e5cd2d183b34 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -63,7 +63,7 @@ struct TensorEvaluator TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get((const_cast(m.data())))), m_dims(m.dimensions()), m_device(device) @@ -72,7 +72,7 @@ struct TensorEvaluator EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) { if (!NumTraits::type>::RequireInitialization && dest) { m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; @@ -82,14 +82,14 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation. done(evalSubExprsIfNeeded(dest)); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {} + EIGEN_STRONG_INLINE void cleanup() {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); @@ -192,7 +192,7 @@ struct TensorEvaluator const Device EIGEN_DEVICE_REF m_device; }; -namespace { +namespace internal { template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) { return *address; @@ -219,8 +219,7 @@ T &loadConstant(const Eigen::TensorSycl::internal::RangeAccess &address return *address; } #endif -} - +} // namespace internal // Default evaluator for rvalues template @@ -262,13 +261,13 @@ struct TensorEvaluator TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device) : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { if (!NumTraits::type>::RequireInitialization && data) { m_device.memcpy((void*)(m_device.get(data)),m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar)); return false; @@ -278,18 +277,18 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType dest, EvalSubExprsCallback done) { // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation. 
done(evalSubExprsIfNeeded(dest)); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { eigen_assert(m_data != NULL); - return loadConstant(m_data+index); + return internal::loadConstant(m_data+index); } template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -314,7 +313,7 @@ struct TensorEvaluator eigen_assert(m_data != NULL); const Index index = (static_cast(Layout) == static_cast(ColMajor)) ? m_dims.IndexOfColMajor(coords) : m_dims.IndexOfRowMajor(coords); - return loadConstant(m_data+index); + return internal::loadConstant(m_data+index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { @@ -357,7 +356,6 @@ struct TensorEvaluator, Device> { typedef TensorCwiseNullaryOp XprType; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } @@ -391,17 +389,17 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { done(true); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { @@ -446,8 +444,8 @@ struct TensorEvaluator, Device> enum { IsAligned = TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - internal::functor_traits::PacketAccess, + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), BlockAccess = TensorEvaluator::BlockAccess, PreferBlockAccess = TensorEvaluator::PreferBlockAccess, Layout = TensorEvaluator::Layout, @@ -455,7 +453,7 @@ struct TensorEvaluator, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) @@ -485,20 +483,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_argImpl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); } @@ -557,21 +555,21 @@ struct TensorEvaluator XprType; enum { - IsAligned = TensorEvaluator::IsAligned & - TensorEvaluator::IsAligned, - PacketAccess = TensorEvaluator::PacketAccess & - TensorEvaluator::PacketAccess & - 
internal::functor_traits::PacketAccess, - BlockAccess = TensorEvaluator::BlockAccess & - TensorEvaluator::BlockAccess, - PreferBlockAccess = TensorEvaluator::PreferBlockAccess | - TensorEvaluator::PreferBlockAccess, + IsAligned = int(TensorEvaluator::IsAligned) & + int(TensorEvaluator::IsAligned), + PacketAccess = int(TensorEvaluator::PacketAccess) & + int(TensorEvaluator::PacketAccess) & + int(internal::functor_traits::PacketAccess), + BlockAccess = int(TensorEvaluator::BlockAccess) & + int(TensorEvaluator::BlockAccess), + PreferBlockAccess = int(TensorEvaluator::PreferBlockAccess) | + int(TensorEvaluator::PreferBlockAccess), Layout = TensorEvaluator::Layout, CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_functor(op.functor()), m_leftImpl(op.lhsExpression(), device), @@ -613,7 +611,7 @@ struct TensorEvaluator - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { // TODO(ezhulenev): Evaluate two expression in parallel? m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) { @@ -631,7 +629,7 @@ struct TensorEvaluator RawAccess = false }; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_condImpl(op.ifExpression(), device), m_thenImpl(op.thenExpression(), device), m_elseImpl(op.elseExpression(), device) @@ -886,7 +884,7 @@ struct TensorEvaluator return m_condImpl.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_condImpl.evalSubExprsIfNeeded(NULL); m_thenImpl.evalSubExprsIfNeeded(NULL); m_elseImpl.evalSubExprsIfNeeded(NULL); @@ -895,7 +893,7 @@ struct TensorEvaluator #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { m_thenImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) { @@ -905,7 +903,7 @@ struct TensorEvaluator } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_condImpl.cleanup(); m_thenImpl.cleanup(); m_elseImpl.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index c62bc5fa90e3c4bb133ad675bc6a54ece797c9a2..4a1a0687cabbb915d3987bedcfd94149e296005d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -144,7 +144,7 @@ struct TensorEvaluator, D typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) { const typename TensorEvaluator::Dimensions& input_dims = m_impl.dimensions(); for (int i = 0; i < NumDims; ++i) { 
eigen_assert(input_dims[i] > 0); @@ -169,7 +169,7 @@ struct TensorEvaluator, D return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (data) { evalToBuf(data); @@ -181,7 +181,7 @@ struct TensorEvaluator, D } } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_data) { m_device.deallocate(m_data); m_data = NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index ca39bb855217e287abeacbf0333c671441bb8046..c7c1cfc7228cc334558b81485820bf6ac714b7f0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -61,7 +61,7 @@ class TensorFixedSize : public TensorBase, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_op(op.expression()), m_device(device), m_buffer(NULL) { } EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } - #if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC - #endif EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp(numValues * sizeof(CoeffReturnType))); @@ -165,7 +162,7 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = m_device.get((CoeffReturnType*)m_device.allocate_temp( @@ -185,7 +182,7 @@ struct TensorEvaluator, Device> } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_device.deallocate_temp(m_buffer); m_buffer = NULL; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index fd8fa00fa6292be32273241155e035858a666c9f..d9630322435fdd2cd75cc4e67ccf97a44088404d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -365,12 +365,16 @@ struct reducer_traits { }; }; - -// Argmin/Argmax reducers +// Argmin/Argmax reducers. Returns the first occurrence if multiple locations +// contain the same min/max value. 
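// As a concrete illustration of the first-occurrence rule documented above
// (tuple values assumed): reducing {(0, 7), (1, 9), (2, 9)} with
// ArgMaxTupleReducer keeps (1, 9). The later (2, 9) does not replace the
// accumulator because its value is not strictly greater and its index is
// not smaller, so ties resolve to the earliest index.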
template struct ArgMaxTupleReducer { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - if (t.second > accum->second) { *accum = t; } + if (t.second < accum->second) { + return; + } else if (t.second > accum->second || accum->first > t.first ) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::lowest()); @@ -394,7 +398,11 @@ struct reducer_traits, Device> { template struct ArgMinTupleReducer { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const { - if (t.second < accum->second) { *accum = t; } + if (t.second > accum->second) { + return; + } else if (t.second < accum->second || accum->first > t.first) { + *accum = t; + } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { return T(0, NumTraits::highest()); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index b1ff1d8b1b6276d453096b66a7314f0dd1bfbb02..174bf06838c008215a312537a03f3c94870db836 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -111,7 +111,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_generator(op.generator()) { TensorEvaluator argImpl(op.expression(), device); @@ -136,10 +136,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h index db394bcbb6349ab229e8526f43e06d3d5931a550..1d142f2ee8790c87590ca235c7fecb5592366da1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaUndefines.h @@ -10,6 +10,8 @@ #if defined(EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H) +#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #undef gpuStream_t #undef gpuDeviceProp_t #undef gpuError_t @@ -35,6 +37,8 @@ #undef gpuDeviceSynchronize #undef gpuMemcpy +#endif // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES + #undef EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H #endif // EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index 49d1004f3d82d80d597dbdca2ba6ba5ffd57b758..dd51850b7760f42713ad5a7ac3b856362d5ac5e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -242,7 +242,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator( const XprType& op, const Device& device) + EIGEN_STRONG_INLINE 
TensorEvaluator( const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -389,20 +389,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -514,16 +514,16 @@ struct TensorEvaluator, Device> } #endif - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 2d8c7b9033f22a48b3e7192c31216396fa270dd0..d3600eab35458acf586e7beecc1d66b87e25d744 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -246,7 +246,7 @@ struct tuple_coeff { template EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple& t) { - return ((i == Idx) & is_compile_time_constant::ValType>::value) || + return ((i == Idx) && is_compile_time_constant::ValType>::value) || tuple_coeff::value_known_statically(i, t); } @@ -468,7 +468,7 @@ struct index_statically_eq_impl { template struct index_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) 
{ - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) == value); } }; @@ -476,7 +476,7 @@ struct index_statically_eq_impl > { template struct index_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) == value); } }; @@ -492,7 +492,7 @@ struct index_statically_ne_impl { template struct index_statically_ne_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) != value); } }; @@ -500,7 +500,7 @@ struct index_statically_ne_impl > { template struct index_statically_ne_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) != value); } }; @@ -516,7 +516,7 @@ struct index_statically_gt_impl { template struct index_statically_gt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) > value); } }; @@ -524,7 +524,7 @@ struct index_statically_gt_impl > { template struct index_statically_gt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) > value); } }; @@ -541,7 +541,7 @@ struct index_statically_lt_impl { template struct index_statically_lt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) < value); } }; @@ -549,7 +549,7 @@ struct index_statically_lt_impl > { template struct index_statically_lt_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexList().value_known_statically(i) & + return IndexList().value_known_statically(i) && (IndexList().get(i) < value); } }; @@ -566,7 +566,7 @@ struct index_pair_first_statically_eq_impl { template struct index_pair_first_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).first == value); } }; @@ -574,7 +574,7 @@ struct index_pair_first_statically_eq_impl struct index_pair_first_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).first == value); } }; @@ -591,7 +591,7 @@ struct index_pair_second_statically_eq_impl { template struct index_pair_second_statically_eq_impl > { EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).second == value); } }; @@ -599,7 +599,7 @@ struct index_pair_second_statically_eq_impl struct index_pair_second_statically_eq_impl > { EIGEN_DEVICE_FUNC static 
constexpr bool run(const Index i, const Index value) { - return IndexPairList().value_known_statically(i) & + return IndexPairList().value_known_statically(i) && (IndexPairList().operator[](i).second == value); } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index 7dadec7fbe7ec11814cb38d71be80d1e28db1efe..c5cb61af53468fd38def4498dab4cab50ae07488 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -103,7 +103,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_strides(op.strides()) { m_dimensions = m_impl.dimensions(); @@ -137,11 +137,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 6d5cce4aa6fe65c7507309e3278da65a04ec93a1..74fdc4c3c7722b2ad5802b1ca62b4d2922a55e80 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -28,8 +28,6 @@ namespace Eigen { namespace internal { -namespace { - // Note: result is undefined if val == 0 template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE @@ -135,8 +133,6 @@ namespace { #endif } }; -} - template struct TensorIntDivisor { @@ -252,7 +248,7 @@ private: template -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator / (const T& numerator, const TensorIntDivisor& divisor) { return divisor.divide(numerator); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h index f159db1b9612c6b9bebee89b42316b5e4745eea6..80106c1a0093dcfaf8b98f8e67ee4738c4c203da 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h @@ -113,7 +113,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { for(int i = 0; i < NumDims; ++i) { @@ -136,10 +136,10 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC 
EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -191,7 +191,7 @@ template typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index a6181d35e6834f5d1010e2450d5eb04c865a2e3a..75b919839e4a401a1e07fb53df2a2a0da12a1b5a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -52,7 +52,7 @@ struct PacketType : internal::packet_traits { }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) +#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE) typedef ulonglong2 Packet4h2; template<> @@ -261,10 +261,10 @@ template struct IndexPair { #ifdef EIGEN_HAS_SFINAE namespace internal { - template + template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - array customIndices2Array(IndexType& idx, numeric_list) { - return { idx[Is]... }; + array customIndices2Array(IndexType& idx, numeric_list) { + return { static_cast(idx[First]), static_cast(idx[Is])... }; } template EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index ef79c8567e2fb4d4e1360297ce2d0c17b647f244..57da2e18d7a443d3d665b2bfd555d929b899e32c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -142,7 +142,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dimensions(op.dimensions()) { // The total size of the reshaped tensor must be equal to the total size @@ -154,16 +154,16 @@ struct TensorEvaluator, Device> #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType data, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(data, std::move(done)); } #endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -255,7 +255,7 @@ template RawAccess = TensorEvaluator::RawAccess }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -369,8 +369,9 @@ class TensorSlicingOp : public TensorBase struct MemcpyTriggerForSlicing { EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) { } EIGEN_DEVICE_FUNC bool operator ()(Index total, Index contiguous) const { @@ -400,7 +401,7 @@ 
template struct MemcpyTriggerForSlicing @@ -443,15 +444,10 @@ struct TensorEvaluator, Devi TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) { - for (Index i = 0; i < internal::array_size::value; ++i) { - eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); - } - m_is_identity = true; - bool degenerate = false; for (int i = 0; i < internal::array_size::value; ++i) { eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]); @@ -459,9 +455,6 @@ struct TensorEvaluator, Devi op.startIndices()[i] != 0) { m_is_identity = false; } - if (op.sizes()[i] == 0) { // we have an empty size - degenerate = true; - } } // No strides for scalars. @@ -479,8 +472,8 @@ struct TensorEvaluator, Devi m_outputStrides[0] = 1; for (int i = 1; i < NumDims; ++i) { m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); } + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } } else { m_inputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { @@ -491,14 +484,14 @@ struct TensorEvaluator, Devi m_outputStrides[NumDims-1] = 1; for (int i = NumDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); } + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); + } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { m_impl.evalSubExprsIfNeeded(NULL); if (!NumTraits::type>::RequireInitialization && data && m_impl.data()) { @@ -519,7 +512,7 @@ struct TensorEvaluator, Devi } } // Use memcpy if it's going to be faster than using the regular evaluation. 
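// ---------------------------------------------------------------------------
// Note on the memcpy fast path set up above: when a slice keeps whole inner
// dimensions intact, 'contiguous_values' coefficients at a time can be copied
// with memcpy instead of evaluated per coefficient. A minimal standalone
// sketch of the trigger logic (the 'Trigger' below is hypothetical and
// simplified from MemcpyTriggerForSlicing; the threshold mirrors the
// 2 * numThreads() constructor seen in the hunks above):
#include <cstring>

struct Trigger {
  long threshold;  // e.g. 2 * device.numThreads() on the threaded device
  bool operator()(long contiguous) const { return contiguous > threshold; }
};

int main() {
  float src[24], dst[12];
  for (int i = 0; i < 24; ++i) src[i] = float(i);
  const long contiguous = 12;  // slice keeps the inner 3x4 block whole
  if (Trigger{4}(contiguous))  // chunk is large enough: bulk copy wins
    std::memcpy(dst, src, contiguous * sizeof(float));
  return dst[11] == 11.f ? 0 : 1;
}
// ---------------------------------------------------------------------------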
- const MemcpyTriggerForSlicing trigger(m_device); + const internal::MemcpyTriggerForSlicing trigger(m_device); if (trigger(internal::array_prod(dimensions()), contiguous_values)) { EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data(); for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) { @@ -534,13 +527,13 @@ struct TensorEvaluator, Devi #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType /*data*/, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -738,7 +731,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockScratchAllocator TensorBlockScratch; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } @@ -906,7 +899,7 @@ struct TensorEvaluator::Dimensions InputDimensions; const InputDimensions& input_dims = m_impl.dimensions(); - // check for degenerate intervals and compute output tensor shape - bool degenerate = false; + // compute output tensor shape m_is_identity = true; for (int i = 0; i < NumDims; i++) { Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) { m_dimensions[i] = 0; - degenerate = true; } else { m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0); @@ -967,8 +958,7 @@ struct TensorEvaluator(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } else { m_inputStrides[NumDims-1] = m_strides[NumDims-1]; @@ -983,8 +973,7 @@ struct TensorEvaluator= 0; --i) { m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; - // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash - m_fastOutputStrides[i] = internal::TensorIntDivisor(degenerate ? 1 : m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1); } } } @@ -992,12 +981,12 @@ struct TensorEvaluator, Device TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) { // The padding op doesn't change the rank of the tensor. 
Directly padding a scalar would lead @@ -151,20 +151,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index 64a436e50740296e22b5592a7104c00ad983c0a9..413d25dd4da57c6ee7c313e13d7ea92a82bd168d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -107,7 +107,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { Index num_patches = 1; @@ -152,12 +152,12 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h index 13450e1a795ee35a65f38f627eee18bd62f81dbd..2bcb39a950a165cd5c499a9a32a8258908592aac 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -14,58 +14,20 @@ namespace Eigen { namespace internal { -namespace { - -EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() { #if defined(EIGEN_GPU_COMPILE_PHASE) // We don't support 3d kernels since we currently only use 1 and // 2d kernels. gpu_assert(threadIdx.z == 0); - return clock64() + - blockIdx.x * blockDim.x + threadIdx.x + - gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); - -#elif defined _WIN32 - // Use the current time as a baseline. - SYSTEMTIME st; - GetSystemTime(&st); - int time = st.wSecond + 1000 * st.wMilliseconds; - // Mix in a random number to make sure that we get different seeds if - // we try to generate seeds faster than the clock resolution. 
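// ---------------------------------------------------------------------------
// Context for the deleted block above: the old per-platform seeding mixed
// ::rand() into a clock reading precisely because two seeds requested within
// one clock tick would otherwise collide; the replacement defers to Eigen's
// own random<uint64_t>() instead. A standalone sketch (not the patch's code)
// of tick-collision-free seeding using only the standard library:
#include <cassert>
#include <cstdint>
#include <random>

uint64_t fresh_seed() {
  std::random_device rd;                 // nondeterministic where available
  return (uint64_t(rd()) << 32) | rd();  // two 32-bit draws -> 64-bit seed
}

int main() {
  // Seeds drawn back-to-back still differ (with overwhelming probability).
  assert(fresh_seed() != fresh_seed());
}
// ---------------------------------------------------------------------------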
- // We need 2 random values since the generator only generate 16 bits at - // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; - return rnd; - -#elif defined __APPLE__ - // Same approach as for win32, except that the random number generator - // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). - uint64_t rnd = ::random() ^ mach_absolute_time(); - return rnd; - -#elif defined __native_client__ - // Same approach as for win32, except using clock_gettime - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - int rnd1 = ::rand(); - int rnd2 = ::rand(); - uint64_t rnd = (rnd1 | rnd2 << 16) ^ ts.tv_nsec; - return rnd; - + return blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); #else - // Augment the current time with pseudo random number generation - // to ensure that we get different seeds if we try to generate seeds - // faster than the clock resolution. - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - uint64_t rnd = ::random() ^ ts.tv_nsec; - return rnd; + // Rely on Eigen's random implementation. + return random(); #endif } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) { // TODO: Unify with the implementation in the non blocking thread pool. uint64_t current = *state; // Update the internal state @@ -74,14 +36,11 @@ static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint6 return static_cast((current ^ (current >> 22)) >> (22 + (current >> 61))); } -static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { seed = seed ? seed : get_random_seed(); return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; } -} // namespace - - template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) { unsigned rnd = PCG_XSH_RS_generator(state, stream); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 0a65591e63e261ea19d797fc057b1390b95767a2..f1f4eaab730e91260da27b50309b34bc71b4a0c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -166,8 +166,12 @@ struct GenericDimReducer<-1, Self, Op> { }; template + bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && + !Self::ReducerTraits::IsExactlyAssociative && + // GPU threads can quickly run out of stack space + // for moderately sized inputs. 
+ !Self::RunningOnGPU + )> struct InnerMostDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) { typename Self::CoeffReturnType accum = reducer.initialize(); @@ -528,6 +532,18 @@ struct TensorReductionEvaluatorBase::size; + + // For full reductions +#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) + static constexpr bool RunningOnGPU = internal::is_same::value; + static constexpr bool RunningOnSycl = false; +#elif defined(EIGEN_USE_SYCL) +static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; +static const bool RunningOnGPU = false; +#else + static constexpr bool RunningOnGPU = false; + static constexpr bool RunningOnSycl = false; +#endif enum { IsAligned = false, @@ -549,7 +565,7 @@ struct TensorReductionEvaluatorBase::value; static const bool RunningFullReduction = (NumOutputDims==0); - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -578,7 +594,7 @@ struct TensorReductionEvaluatorBase(m_outputStrides[i]); } } else { - m_outputStrides[NumOutputDims - 1] = 1; + m_outputStrides[static_cast(NumOutputDims - 1)] = 1; for (int i = NumOutputDims - 2; i >= 0; --i) { m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); @@ -631,13 +647,6 @@ struct TensorReductionEvaluatorBase EIGEN_STRONG_INLINE -#if !defined(EIGEN_HIPCC) - EIGEN_DEVICE_FUNC -#endif void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) { @@ -759,19 +765,12 @@ struct TensorReductionEvaluatorBase::value; - static const bool RunningOnSycl = false; -#elif defined(EIGEN_USE_SYCL) -static const bool RunningOnSycl = internal::is_same::type, Eigen::SyclDevice>::value; -static const bool RunningOnGPU = false; -#else - static const bool RunningOnGPU = false; - static const bool RunningOnSycl = false; -#endif EvaluatorPointerType m_result; const Device EIGEN_DEVICE_REF m_device; @@ -987,7 +975,7 @@ template class Ma struct TensorEvaluator, Device> : public TensorReductionEvaluatorBase, Device> { typedef TensorReductionEvaluatorBase, Device> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device){} }; @@ -996,7 +984,7 @@ struct TensorEvaluator, : public TensorReductionEvaluatorBase, Eigen::SyclDevice> { typedef TensorReductionEvaluatorBase, Eigen::SyclDevice> Base; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} + EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device) : Base(op, device){} // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL kernel //Therefore the coeff function should be overridden by for SYCL kernel EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) 
const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index db4e8d866f04ce0586835da2595a171856ae96dd..315ccc172e9aded42fdf4b031054bb47f90ee9e7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -98,6 +98,7 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) { } } } +#ifdef EIGEN_GPU_COMPILE_PHASE // reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations template __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) { @@ -107,6 +108,7 @@ __device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reduc atomicReduce(houtput+i,*(haccum+i),reducer); } } +#endif // EIGEN_GPU_COMPILE_PHASE #endif // EIGEN_HAS_GPU_FP16 template <> @@ -213,8 +215,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer #ifdef EIGEN_HAS_GPU_FP16 template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, half* scratch) { eigen_assert(blockDim.x == 1); eigen_assert(gridDim.x == 1); typedef packet_traits::type packet_type; @@ -224,15 +226,16 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFlo half2* h2scratch = reinterpret_cast(scratch); for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) { *h2scratch = - __halves2half2(input.m_impl.coeff(i), input.m_impl.coeff(i + 1)); + __halves2half2(input.coeff(i), input.coeff(i + 1)); h2scratch++; } if ((num_coeffs & 1) != 0) { - half lastCoeff = input.m_impl.coeff(num_coeffs - 1); + half lastCoeff = input.coeff(num_coeffs - 1); *h2scratch = __halves2half2(lastCoeff, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + packet_type reduce = reducer.template initializePacket(); + internal::pstoreu(scratch, reduce); } } @@ -258,8 +261,9 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reduce template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, - half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat( + Reducer reducer, const Self input, Index num_coeffs, + half* output, half* scratch) { typedef typename packet_traits::type PacketType; const int packet_width = unpacket_traits::size; eigen_assert(NumPerThread % packet_width == 0); @@ -273,19 +277,20 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce int rem = num_coeffs % packet_width; if (rem != 0) { half2* p_scratch = reinterpret_cast(scratch); - *scratch = reducer.template initializePacket(); + pstoreu(scratch, reducer.template initializePacket()); for (int i = 0; i < rem / 2; i++) { *p_scratch = __halves2half2( - input.m_impl.coeff(num_coeffs - packet_width + 2 * i), - input.m_impl.coeff(num_coeffs - packet_width + 2 * i + 1)); + input.coeff(num_coeffs - packet_width + 2 * i), + input.coeff(num_coeffs - packet_width + 2 * i + 1)); p_scratch++; } if ((num_coeffs & 1) != 0) { - half last = input.m_impl.coeff(num_coeffs - 1); + half last = input.coeff(num_coeffs - 1); 
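// ---------------------------------------------------------------------------
// Background for the UseTreeReduction condition added in TensorReduction.h
// above: for reducers that are not exactly associative (float sums), pairwise
// tree reduction grows rounding error like O(log n) instead of O(n), but its
// recursion is exactly why the patch disables it on GPU, where per-thread
// stack is scarce. Minimal sketch contrasting the two accumulation orders:
#include <cstdio>
#include <vector>

float running_sum(const float* x, int n) {
  float acc = 0.f;
  for (int i = 0; i < n; ++i) acc += x[i];  // error accumulates linearly
  return acc;
}

float tree_sum(const float* x, int n) {     // recursion depth ~ log2(n)
  if (n == 1) return x[0];
  const int half = n / 2;
  return tree_sum(x, half) + tree_sum(x + half, n - half);
}

int main() {
  std::vector<float> v(1 << 24, 0.1f);      // exact answer: 1677721.6
  std::printf("running=%.1f tree=%.1f\n",   // running sum drifts visibly
              running_sum(v.data(), int(v.size())),
              tree_sum(v.data(), int(v.size())));
}
// ---------------------------------------------------------------------------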
*p_scratch = __halves2half2(last, reducer.initialize()); } } else { - *scratch = reducer.template initializePacket(); + PacketType reduce = reducer.template initializePacket(); + pstoreu(scratch, reduce); } } __syncthreads(); @@ -298,7 +303,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce for (Index i = 0; i < max_iter; i += BlockSize) { const Index index = first_index + packet_width * i; eigen_assert(index + packet_width < num_coeffs); - PacketType val = input.m_impl.template packet(index); + PacketType val = input.template packet(index); reducer.reducePacket(val, &accum); } @@ -337,7 +342,7 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } if ((threadIdx.x & (warpSize - 1)) == 0) { - atomicReduce(scratch, accum, reducer); + atomicReduce(reinterpret_cast(scratch), accum, reducer); } __syncthreads(); @@ -357,17 +362,21 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reduce } template -__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits::type* scratch) { +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) { eigen_assert(threadIdx.x == 1); - half2* pscratch = reinterpret_cast(scratch); - half tmp = __float2half(0.f); typedef packet_traits::type packet_type; - for (int i = 0; i < unpacket_traits::size; i += 2) { - reducer.reduce(__low2half(*pscratch), &tmp); - reducer.reduce(__high2half(*pscratch), &tmp); - pscratch++; + if (unpacket_traits::size == 1) { + *output = *scratch; + } else { + half2* pscratch = reinterpret_cast(scratch); + half tmp = __float2half(0.f); + for (int i = 0; i < unpacket_traits::size; i += 2) { + reducer.reduce(__low2half(*pscratch), &tmp); + reducer.reduce(__high2half(*pscratch), &tmp); + pscratch++; + } + *output = tmp; } - *output = tmp; } #endif // EIGEN_HAS_GPU_FP16 @@ -416,13 +425,11 @@ template struct FullReductionLauncher { static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; - typedef typename packet_traits::type PacketType; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup(num_coeffs, block_size * num_per_thread); - PacketType* scratch = static_cast(device.scratchpad()); - // half2* scratch = static_cast(device.scratchpad()); + half* scratch = static_cast(device.scratchpad()); if (num_blocks > 1) { // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index 030d19844353e65d83f53cf344ad9bc786919315..a27d3646de15aa5c7656973c57f1c99c717dda07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -388,17 +388,17 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) + EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef& m, const Device&) : m_ref(m) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool 
evalSubExprsIfNeeded(EvaluatorPointerType) { return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { } + EIGEN_STRONG_INLINE void cleanup() { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_ref.coeff(index); @@ -439,7 +439,7 @@ struct TensorEvaluator, Device> : public TensorEvaluator& m, const Device& d) : Base(m, d) + EIGEN_STRONG_INLINE TensorEvaluator(TensorRef& m, const Device& d) : Base(m, d) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 3b1fca59b79c1a1f882dcf8f8ea17e60fc43dbef..586ce68ab0ae3363127f0b5118ad9dd938195916 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -121,8 +121,7 @@ struct TensorEvaluator, Device TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) @@ -150,20 +149,20 @@ struct TensorEvaluator, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -426,8 +425,7 @@ struct TensorEvaluator, Device> CoordAccess = false, // to be implemented RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} typedef typename XprType::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index a06c4a9f31caa5ed7df61209ef22bcefd39bbdbe..beae854ddba66753048e97a4a0037ec56d12c73f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -402,8 +402,7 @@ struct TensorEvaluator, Device> { typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_device(device), m_exclusive(op.exclusive()), @@ -498,7 +497,7 @@ struct TensorEvaluator, Device> { return TensorOpCost(sizeof(CoeffReturnType), 0, 0); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { if (m_output) { m_device.deallocate_temp(m_output); m_output = NULL; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h 
b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index e6fed3d0baee0db0cca670a9b489fdda57b23690..e5e5efdeecad146089fea2fdf3697c030adbdb07 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -118,8 +118,7 @@ struct TensorEvaluator, Device> TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, - const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_device(device), m_impl(op.expression(), device) { @@ -143,7 +142,8 @@ struct TensorEvaluator, Device> m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1]; m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); } } else { m_unshuffledInputStrides[NumDims - 1] = 1; @@ -152,7 +152,8 @@ struct TensorEvaluator, Device> m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1]; m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1]; - m_fastOutputStrides[i] = internal::TensorIntDivisor(m_outputStrides[i]); + m_fastOutputStrides[i] = internal::TensorIntDivisor( + m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1)); } } @@ -163,20 +164,20 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } #ifdef EIGEN_USE_THREADS template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( + EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync( EvaluatorPointerType, EvalSubExprsCallback done) { m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); } #endif // EIGEN_USE_THREADS - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -384,7 +385,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockDescriptor TensorBlockDesc; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 5ff0880e78ef3a48f30b85257176cbde3dea0935..66655c4777f84f3366d495f9beeda269f3346e98 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -55,17 +55,14 @@ class TensorStorage EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T *data() const { return m_data; } - static EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const FixedDimensions& dimensions() - { - static const FixedDimensions* singleton_dimensions = new FixedDimensions(); - return *singleton_dimensions; - } + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return FixedDimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return Size; } }; + // pure dynamic 
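// ---------------------------------------------------------------------------
// On the 'm_outputStrides[i] > 0 ? ... : 1' guards in the shuffling hunks
// above (and the slicing ones earlier): a size-0 dimension zeroes some output
// strides, and a TensorIntDivisor-style fast divisor cannot be built from 0.
// Simplified 32-bit model of the multiply-shift division trick (an assumed
// sketch, not Eigen's exact algorithm; needs a compiler with __uint128_t):
#include <cassert>
#include <cstdint>

struct FastDiv {
  uint64_t magic;
  explicit FastDiv(uint32_t d) : magic(~uint64_t(0) / d + 1) {
    assert(d > 1);  // d == 0 is a division by zero; d == 1 overflows 'magic'
  }
  uint32_t divide(uint32_t n) const {
    return uint32_t((__uint128_t(magic) * n) >> 64);  // take the high 64 bits
  }
};

int main() {
  const FastDiv div7(7);
  for (uint32_t n = 0; n < 100000; ++n) assert(div7.divide(n) == n / 7);
}
// ---------------------------------------------------------------------------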
template class TensorStorage, Options_> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 64bf3f139492347512c0ff08b6b9128b6bb4ce63..2f62a668f4dac17452dc3f0cf6323bfad130e8ca 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -109,7 +109,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { m_dimensions = m_impl.dimensions(); @@ -142,11 +142,11 @@ struct TensorEvaluator, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType/*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -277,7 +277,7 @@ struct TensorEvaluator, Device> RawAccess = false }; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } typedef typename XprType::Index Index; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h index 24d22c189cacbce7b96532231e5f4b935f36bb5b..bbd2ff33259635bcf3f46bda987dc5ec3cae302d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h @@ -108,7 +108,7 @@ struct TensorEvaluator, Device> typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_traceDim(1), m_device(device) { @@ -134,6 +134,7 @@ struct TensorEvaluator, Device> } } + EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims); eigen_assert(num_distinct_reduce_dims == NumReducedDims); // Compute the dimensions of the result. @@ -211,12 +212,12 @@ struct TensorEvaluator, Device> return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h index 4f7fd340ee2492e6d0eb8525439b23a6d39583cd..a8a535a7fc7ac0b75b8238b4a7e87e4e776417a3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h @@ -254,10 +254,10 @@ struct nested > // the SAME case. // When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0, // Pc=0. 
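// ---------------------------------------------------------------------------
// The TensorTrace hunk above marks num_distinct_reduce_dims with
// EIGEN_ONLY_USED_FOR_DEBUG because the variable only feeds eigen_assert():
// with NDEBUG the assert compiles away and the compiler would warn about an
// unused variable. A stand-in for the macro and the pattern (the macro name
// below is local to this sketch, not Eigen's definition):
#include <cassert>

#define ONLY_USED_FOR_DEBUG(x) (void)(x)

int main() {
  int checked_value = 2 + 2;
  ONLY_USED_FOR_DEBUG(checked_value);  // silences -Wunused under NDEBUG
  assert(checked_value == 4);
}
// ---------------------------------------------------------------------------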
-typedef enum { +enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 -} PaddingType; +}; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index d23f2e4c81eabb3e3a68e0fb35c49ac5f03c2a68..afbcba4a27f25b1640196fc72e29b2cda004c525 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -78,14 +78,14 @@ template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator == (const TensorUInt128& lhs, const TensorUInt128& rhs) { - return (lhs.high == rhs.high) & (lhs.low == rhs.low); + return (lhs.high == rhs.high) && (lhs.low == rhs.low); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator != (const TensorUInt128& lhs, const TensorUInt128& rhs) { - return (lhs.high != rhs.high) | (lhs.low != rhs.low); + return (lhs.high != rhs.high) || (lhs.low != rhs.low); } template diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index 81bed57f389766f049bb5ae582ca097e361022a1..0beb9ff09b832f16006d8a57f3079f3aedb91599 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -194,7 +194,7 @@ struct TensorEvaluator, D typedef internal::TensorBlockNotImplemented TensorBlock; //===--------------------------------------------------------------------===// - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : + EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -352,12 +352,12 @@ struct TensorEvaluator, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { m_impl.evalSubExprsIfNeeded(NULL); return true; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); } @@ -518,21 +518,21 @@ struct TensorEvaluator, D const TensorEvaluator& impl() const { return m_impl; } - Index planePaddingTop() const { return m_planePaddingTop; } - Index rowPaddingTop() const { return m_rowPaddingTop; } - Index colPaddingLeft() const { return m_colPaddingLeft; } - Index outputPlanes() const { return m_outputPlanes; } - Index outputRows() const { return m_outputRows; } - Index outputCols() const { return m_outputCols; } - Index userPlaneStride() const { return m_plane_strides; } - Index userRowStride() const { return m_row_strides; } - Index userColStride() const { return m_col_strides; } - Index userInPlaneStride() const { return m_in_plane_strides; } - Index userInRowStride() const { return m_in_row_strides; } - Index userInColStride() const { return m_in_col_strides; } - Index planeInflateStride() const { return m_plane_inflate_strides; } - Index rowInflateStride() const { return m_row_inflate_strides; } - Index colInflateStride() const { return m_col_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; } + 
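// ---------------------------------------------------------------------------
// On the operator swaps in TensorUInt128 above and the TensorIndexList hunks
// earlier in the patch: for bool operands '&'/'|' and '&&'/'||' agree in
// value, but the logical forms short-circuit, which both silences newer
// compiler warnings and lets the left operand guard the right one.
// Standalone illustration of the guarding behavior:
#include <cassert>

constexpr int table[3] = {10, 20, 30};

constexpr bool known_and_equal(int i, int value) {
  // With '&', table[i] would be evaluated even when the bound check fails,
  // which is out-of-range (and ill-formed in a constant expression).
  return (i >= 0 && i < 3) && (table[i] == value);
}

int main() {
  static_assert(known_and_equal(1, 20), "guarded lookup");
  assert(!known_and_equal(7, 20));  // short-circuit: table[7] is never read
}
// ---------------------------------------------------------------------------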
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; } #ifdef EIGEN_USE_SYCL // binding placeholder accessors to a command group handler for SYCL diff --git a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h index 149ceaff01d99924b3fb6b410b524c66097e37ce..f662dee5bebfc96d34e41e184a512aa5fda1cb38 100644 --- a/unsupported/Eigen/CXX11/src/util/CXX11Meta.h +++ b/unsupported/Eigen/CXX11/src/util/CXX11Meta.h @@ -81,7 +81,8 @@ template struct take<0, type_list> template<> struct take<0, type_list<>> { typedef type_list<> type; }; template struct take> : concat, typename take>::type> {}; -template struct take> { typedef numeric_list type; }; +// XXX The following breaks in gcc-11, and is invalid anyways. 
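// ---------------------------------------------------------------------------
// The annotation churn throughout this patch follows one policy: accessors
// that kernels call (the getters above, coeff()/packet()) gain
// EIGEN_DEVICE_FUNC, while host-side orchestration (the many constructors,
// evalSubExprsIfNeeded() and cleanup() methods) loses it, since marking
// host-only code __device__ only draws nvcc warnings. Rough model of what
// the macro amounts to under a CUDA compiler (simplified; the real
// definition lives in Eigen's util/Macros.h):
#include <cstdio>

#if defined(__CUDACC__)
#define DEVICE_FUNC __host__ __device__   // callable from host and kernels
#else
#define DEVICE_FUNC                       // plain host function otherwise
#endif

struct Evaluator {
  DEVICE_FUNC int coeff(int i) const { return i; }  // needed inside kernels
  bool evalSubExprsIfNeeded() { return true; }      // host-only: allocates,
};                                                  // launches, dispatches

int main() {
  Evaluator e;
  std::printf("%d %d\n", e.coeff(3), int(e.evalSubExprsIfNeeded()));
}
// ---------------------------------------------------------------------------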
+// template struct take> { typedef numeric_list type; }; template struct take<0, numeric_list> { typedef numeric_list type; }; template struct take<0, numeric_list> { typedef numeric_list type; }; diff --git a/unsupported/Eigen/FFT b/unsupported/Eigen/FFT index c8c311a60bef74004ac3daad1d1c661e55c70ec3..72cd52143a6e86dba18f7f8a7a4778a7652f5856 100644 --- a/unsupported/Eigen/FFT +++ b/unsupported/Eigen/FFT @@ -162,15 +162,16 @@ class FFT typedef typename impl_type::Scalar Scalar; typedef typename impl_type::Complex Complex; - enum Flag { - Default=0, // goof proof - Unscaled=1, - HalfSpectrum=2, - // SomeOtherSpeedOptimization=4 - Speedy=32767 - }; - - FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { } + typedef int Flag; + static const Flag Default = 0; + static const Flag Unscaled = 1; + static const Flag HalfSpectrum = 2; + static const Flag Speedy = 32767; + + FFT(const impl_type& impl = impl_type(), Flag flags = Default) : m_impl(impl), m_flag(flags) + { + eigen_assert((flags == Default || flags == Unscaled || flags == HalfSpectrum || flags == Speedy) && "invalid flags argument"); + } inline bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;} diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 0ef159e30041882458e569eab56c42170c18873d..0f166e35f01f2ab4951404693366b191d0ebe71b 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -26,11 +26,11 @@ void make_coherent(const A& a, const B&b) make_coherent_impl::run(a.const_cast_derived(), b.const_cast_derived()); } -template struct auto_diff_special_op; +template struct auto_diff_special_op; } // end namespace internal -template class AutoDiffScalar; +template class AutoDiffScalar; template inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { @@ -38,16 +38,16 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: } /** \class AutoDiffScalar - * \brief A scalar type replacement with automatic differentation capability + * \brief A scalar type replacement with automatic differentiation capability * - * \param _DerType the vector type used to store/represent the derivatives. The base scalar type + * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type * as well as the number of derivatives to compute are determined from this type. * Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf * if the number of derivatives is not known at compile time, and/or, the number * of derivatives is large. - * Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a + * Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a * existing vector into an AutoDiffScalar. - * Finally, _DerType can also be any Eigen compatible expression. + * Finally, DerivativeType can also be any Eigen compatible expression. * * This class represents a scalar value while tracking its respective derivatives using Eigen's expression * template mechanism. 
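// ---------------------------------------------------------------------------
// On the renames in the AutoDiffScalar hunks here and below: identifiers like
// '_DerType' (underscore followed by an uppercase letter) are reserved for
// the implementation by the C++ standard ([lex.name]), so libraries must not
// declare them; 'DerivativeType' is the conforming spelling, with the old
// short name kept as a public alias. Minimal sketch of the safe pattern:
template <typename DerivativeType>      // no reserved identifier
struct AutoDiffLike {
  typedef DerivativeType DerType;       // public alias, as the patch keeps
  DerivativeType der;
};

int main() {
  AutoDiffLike<double> x{1.5};
  return x.der == 1.5 ? 0 : 1;
}
// ---------------------------------------------------------------------------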
@@ -63,17 +63,17 @@ inline AutoDiffScalar MakeAutoDiffScalar(const typename NewDerType:: * */ -template +template class AutoDiffScalar : public internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op - <_DerType, !internal::is_same::type>::Scalar, - typename NumTraits::type>::Scalar>::Real>::value> Base; - typedef typename internal::remove_all<_DerType>::type DerType; + ::type>::Scalar, + typename NumTraits::type>::Scalar>::Real>::value> Base; + typedef typename internal::remove_all::type DerType; typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real Real; @@ -382,16 +382,16 @@ class AutoDiffScalar namespace internal { -template -struct auto_diff_special_op<_DerType, true> -// : auto_diff_scalar_op<_DerType, typename NumTraits::Real, +template +struct auto_diff_special_op +// : auto_diff_scalar_op::Real, // is_same::Real>::value> { - typedef typename remove_all<_DerType>::type DerType; + typedef typename remove_all::type DerType; typedef typename traits::Scalar Scalar; typedef typename NumTraits::Real Real; -// typedef auto_diff_scalar_op<_DerType, typename NumTraits::Real, +// typedef auto_diff_scalar_op::Real, // is_same::Real>::value> Base; // using Base::operator+; @@ -401,8 +401,8 @@ struct auto_diff_special_op<_DerType, true> // using Base::operator*; // using Base::operator*=; - const AutoDiffScalar<_DerType>& derived() const { return *static_cast*>(this); } - AutoDiffScalar<_DerType>& derived() { return *static_cast*>(this); } + const AutoDiffScalar& derived() const { return *static_cast*>(this); } + AutoDiffScalar& derived() { return *static_cast*>(this); } inline const AutoDiffScalar operator+(const Real& other) const @@ -410,12 +410,12 @@ struct auto_diff_special_op<_DerType, true> return AutoDiffScalar(derived().value() + other, derived().derivatives()); } - friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar<_DerType>& b) + friend inline const AutoDiffScalar operator+(const Real& a, const AutoDiffScalar& b) { return AutoDiffScalar(a + b.value(), b.derivatives()); } - inline AutoDiffScalar<_DerType>& operator+=(const Real& other) + inline AutoDiffScalar& operator+=(const Real& other) { derived().value() += other; return derived(); @@ -431,22 +431,22 @@ struct auto_diff_special_op<_DerType, true> } friend inline const AutoDiffScalar >, DerType>::Type > - operator*(const Real& other, const AutoDiffScalar<_DerType>& a) + operator*(const Real& other, const AutoDiffScalar& a) { return AutoDiffScalar >, DerType>::Type >( a.value() * other, a.derivatives() * other); } - inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other) + inline AutoDiffScalar& operator*=(const Scalar& other) { *this = *this * other; return derived(); } }; -template -struct auto_diff_special_op<_DerType, false> +template +struct auto_diff_special_op { void operator*() const; void operator-() const; diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h index d7672d7c9739ee2f0688d3ca93b80475c4263175..ce92f5bfd7c749f5d52b61c1973dcee70e63b2c9 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h @@ -281,7 +281,7 @@ inline int MatrixPowerAtomic::getPadeDegree(long double normIminusT) #endif int degree = 3; for (; degree <= 
maxPadeDegree; ++degree) - if (normIminusT <= maxNormForPade[degree - 3]) + if (normIminusT <= static_cast(maxNormForPade[degree - 3])) break; return degree; } diff --git a/unsupported/Eigen/src/Polynomials/Companion.h b/unsupported/Eigen/src/Polynomials/Companion.h index 6ab8f9714b66f2c6b83c75551513748ec7d6c0b6..59a15b098e2e6e655babaa6b5beccb68e4daefda 100644 --- a/unsupported/Eigen/src/Polynomials/Companion.h +++ b/unsupported/Eigen/src/Polynomials/Companion.h @@ -20,12 +20,6 @@ namespace internal { #ifndef EIGEN_PARSED_BY_DOXYGEN -template -T radix(){ return 2; } - -template -T radix2(){ return radix()*radix(); } - template struct decrement_if_fixed_size { @@ -141,7 +135,10 @@ inline bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, bool& isBalanced, RealScalar& colB, RealScalar& rowB ) { - if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm ){ return true; } + if( RealScalar(0) == colNorm || RealScalar(0) == rowNorm + || !(numext::isfinite)(colNorm) || !(numext::isfinite)(rowNorm)){ + return true; + } else { //To find the balancing coefficients, if the radix is 2, @@ -149,33 +146,41 @@ bool companion<_Scalar,_Deg>::balanced( RealScalar colNorm, RealScalar rowNorm, // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$ // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$ // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$ - rowB = rowNorm / radix(); + const RealScalar radix = RealScalar(2); + const RealScalar radix2 = RealScalar(4); + + rowB = rowNorm / radix; colB = RealScalar(1); const RealScalar s = colNorm + rowNorm; - while (colNorm < rowB) + // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm + RealScalar scout = colNorm; + while (scout < rowB) { - colB *= radix(); - colNorm *= radix2(); + colB *= radix; + scout *= radix2; } - - rowB = rowNorm * radix(); - - while (colNorm >= rowB) + + // We now have an upper-bound for sigma, try to lower it. + // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm + scout = colNorm * (colB / radix) * colB; // Avoid overflow. + while (scout >= rowNorm) { - colB /= radix(); - colNorm /= radix2(); + colB /= radix; + scout /= radix2; } - //This line is used to avoid insubstantial balancing - if ((rowNorm + colNorm) < RealScalar(0.95) * s * colB) + // This line is used to avoid insubstantial balancing. + if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB) { isBalanced = false; rowB = RealScalar(1) / colB; return false; } - else{ - return true; } + else + { + return true; + } } } diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h index f1c260e29450eb6b557e664138b8093d343a6f9e..243ffdd5e33f0af5feaecf7539dfa1f0129a3af4 100644 --- a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -301,12 +301,9 @@ struct digamma_impl { This implementation works on both scalars and Ts. */ template -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) { - // Clamp the inputs to the range [-4, 4] since anything outside - // this range is +/-1.0f in single-precision. 
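// ---------------------------------------------------------------------------
// The Companion::balanced() rewrite above searches for a power-of-two column
// scale colB (row scale 1/colB) that evens out the row and column norms,
// tracking a 'scout' value so the norms themselves are never squared into
// overflow. Simplified scalar sketch of the search and the 0.95 acceptance
// test (structure per the hunk, details condensed):
#include <cassert>

bool balance(double colNorm, double rowNorm, double& colB) {
  const double s = colNorm + rowNorm;
  colB = 1.0;
  double scout = colNorm;  // invariant: scout == colNorm * colB * colB
  while (scout < rowNorm / 2) { colB *= 2; scout *= 4; }  // grow the scale
  while (scout / 2 >= rowNorm) { colB /= 2; scout /= 4; } // undo overshoot
  // Accept only if scaling shrinks the combined norm by at least 5%.
  return (rowNorm / colB + colNorm * colB) < 0.95 * s;
}

int main() {
  double colB = 1.0;
  assert(balance(1.0, 1024.0, colB) && colB > 1.0);  // wildly unbalanced
  assert(!balance(1.0, 1.0, colB));                  // already balanced
}
// ---------------------------------------------------------------------------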
-  const T plus_4 = pset1<T>(4.f);
-  const T minus_4 = pset1<T>(-4.f);
-  const T x = pmax(pmin(a_x, plus_4), minus_4);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& x) {
+  const float kErfInvOneMinusHalfULP = 3.832506856900711f;
+  const T clamp = pcmp_le(pset1<T>(kErfInvOneMinusHalfULP), pabs(x));
   // The monomial coefficients of the numerator polynomial (odd).
   const T alpha_1 = pset1<T>(-1.60960333262415e-02f);
   const T alpha_3 = pset1<T>(-2.95459980854025e-03f);
@@ -342,7 +339,8 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf_float(const T& a_x) {
   q = pmadd(x2, q, beta_0);
 
   // Divide the numerator by the denominator.
-  return pdiv(p, q);
+  const T sign = pselect(pcmp_le(x, pset1<T>(0.0f)), pset1<T>(-1.0f), pset1<T>(1.0f));
+  return pselect(clamp, sign, pdiv(p, q));
 }
 
 template <typename T>
@@ -473,9 +471,9 @@ struct erfc_impl {
  *  ERROR MESSAGES:
  *
  *   message         condition    value returned
- * ndtri domain       x <= 0        -MAXNUM
- * ndtri domain       x >= 1         MAXNUM
- *
+ * ndtri domain       x == 0        -INF
+ * ndtri domain       x == 1         INF
+ * ndtri domain       x < 0, x > 1   NAN
  */
 
 /*
 Cephes Math Library Release 2.2:  June, 1992
@@ -637,8 +635,8 @@ T generic_ndtri(const T& a) {
           generic_ndtri_lt_exp_neg_two(b, should_flipsign));
 
   return pselect(
-      pcmp_le(a, zero), neg_maxnum,
-      pselect(pcmp_le(one, a), maxnum, ndtri));
+      pcmp_eq(a, zero), neg_maxnum,
+      pselect(pcmp_eq(one, a), maxnum, ndtri));
 }
 
 template <typename Scalar>
@@ -1387,7 +1385,7 @@ struct zeta_impl {
         };
 
     const Scalar maxnum = NumTraits<Scalar>::infinity();
-    const Scalar zero = 0.0, half = 0.5, one = 1.0;
+    const Scalar zero = Scalar(0.0), half = Scalar(0.5), one = Scalar(1.0);
     const Scalar machep = cephes_helper<Scalar>::machep();
     const Scalar nan = NumTraits<Scalar>::quiet_NaN();
 
@@ -1429,11 +1427,19 @@ struct zeta_impl {
             return s;
         }
 
+        // If b is zero, then the tail sum will also end up being zero.
+        // Exiting early here can prevent NaNs for some large inputs, where
+        // the tail sum computed below has term `a` which can overflow to `inf`.
+        if (numext::equal_strict(b, zero)) {
+          return s;
+        }
+
         w = a;
         s += b*w/(x-one);
         s -= half * b;
         a = one;
         k = zero;
+
         for( i=0; i<12; i++ )
         {
             a *= x + k;
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
index 7dd3c3e5be6de84176da2459a860fec6c344c770..909b08e16503483b4d67ad0b43c214e12cefcc99 100644
--- a/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -4,6 +4,9 @@
 namespace Eigen {
 namespace internal {
 
+// Bessel functions only available for some compilers.
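Back in SpecialFunctionsImpl.h, the erf change replaces input clamping with output selection: once |x| >= erf^-1(1 - 0.5 ulp(1.0f)) ~= 3.8325, the answer is exactly +/-1 in float, so the polynomial result is only used inside that range. A scalar sketch of the same selection (std::erf stands in for the patch's rational approximant p/q; only the threshold constant is taken from the patch):

#include <cmath>
#include <cstdio>

// Smallest float magnitude at which erf rounds to exactly 1.0f.
const float kErfInvOneMinusHalfULP = 3.832506856900711f;

float fast_erf_scalar(float x) {
  if (std::abs(x) >= kErfInvOneMinusHalfULP)
    return x < 0.0f ? -1.0f : 1.0f;  // saturate, keeping the sign of x
  return std::erf(x);                // stand-in for the p/q polynomial ratio
}

int main() {
  std::printf("%f %f\n", fast_erf_scalar(4.0f), fast_erf_scalar(-4.0f));  // 1 -1
  return 0;
}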
+#if EIGEN_HAS_AVX512_MATH + F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0) @@ -40,6 +43,8 @@ BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0) F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1) BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1) +#endif + } // namespace internal } // namespace Eigen diff --git a/unsupported/doc/examples/SYCL/CMakeLists.txt b/unsupported/doc/examples/SYCL/CMakeLists.txt index bef4f1925c6b0fb0666b8895883af8b315cbdbe1..1d0f721dc1228061edad1ba71680713fb38c7764 100644 --- a/unsupported/doc/examples/SYCL/CMakeLists.txt +++ b/unsupported/doc/examples/SYCL/CMakeLists.txt @@ -3,8 +3,7 @@ FILE(GLOB examples_SRCS "*.cpp") set(EIGEN_SYCL ON) list(APPEND CMAKE_EXE_LINKER_FLAGS -pthread) if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 14) - set(STD_CXX_FLAG "-std=c++1z") + set(CMAKE_CXX_STANDARD 17) else(EIGEN_SYCL_TRISYCL) if(MSVC) # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 181919361e5d3a34c06828f8b62084255ce3bccc..e917dffb5cd7a3df3dea16f2e2c22de957aa74c0 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -55,13 +55,11 @@ ei_add_test(FFT) ei_add_test(EulerAngles) -find_package(MPFR 2.3.0) -find_package(GMP) -if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11) - include_directories(${MPFR_INCLUDES} ./mpreal) +find_package(MPREAL) +if(MPREAL_FOUND AND EIGEN_COMPILER_SUPPORT_CPP11) ei_add_property(EIGEN_TESTED_BACKENDS "MPFR C++, ") - set(EIGEN_MPFR_TEST_LIBRARIES ${MPFR_LIBRARIES} ${GMP_LIBRARIES}) - ei_add_test(mpreal_support "-std=c++11" "${EIGEN_MPFR_TEST_LIBRARIES}" ) + include_directories(${MPREAL_INCLUDES}) + ei_add_test(mpreal_support "-std=c++11" "${MPREAL_LIBRARIES}" ) else() ei_add_property(EIGEN_MISSING_BACKENDS "MPFR C++, ") endif() @@ -165,8 +163,8 @@ if(EIGEN_TEST_CXX11) endif() if(EIGEN_SYCL_TRISYCL) - set(CMAKE_CXX_STANDARD 14) - set(STD_CXX_FLAG "-std=c++1z") + # triSYCL now requires c++17. + set(CMAKE_CXX_STANDARD 17) else() if(MSVC) # Set the host and device compilers C++ standard to C++14. 
On Windows setting this to C++11 @@ -292,8 +290,14 @@ endif() endif() # These tests needs nvcc -find_package(CUDA 7.0) -if(CUDA_FOUND AND EIGEN_TEST_CUDA) +check_language(CUDA) +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) +else() + message(STATUS "Could NOT find CUDA.") +endif() + +if(CMAKE_CUDA_COMPILER AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings # in the CUDA runtime @@ -304,30 +308,23 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") - message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) - - if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") - set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) - endif() if(EIGEN_TEST_CUDA_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") string(APPEND CMAKE_CXX_FLAGS " --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}") foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${ARCH}") endforeach() + string(APPEND CMAKE_CXX_FLAGS " ${EIGEN_CUDA_CXX_FLAGS}") + else() + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(NVCC_ARCH_FLAGS) + foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") + endforeach() + set(CUDA_NVCC_FLAGS "--expt-relaxed-constexpr -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS} ${EIGEN_CUDA_CXX_FLAGS}") + cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") endif() - - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") - if (${CUDA_VERSION} STREQUAL "7.0") - set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") - endif() - - set(NVCC_ARCH_FLAGS) - foreach(ARCH IN LISTS EIGEN_CUDA_COMPUTE_ARCH) - string(APPEND NVCC_ARCH_FLAGS " -gencode arch=compute_${ARCH},code=sm_${ARCH}") - endforeach() - set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_RELAXED_CONSTEXPR} -Xcudafe \"--display_error_number\" ${NVCC_ARCH_FLAGS} ${CUDA_NVCC_FLAGS}") - cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") + set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(cxx11_tensor_complex_gpu) diff --git a/unsupported/test/NonLinearOptimization.cpp b/unsupported/test/NonLinearOptimization.cpp index c667b7247ddcd72d8bbac7150f4c024f2748cfcb..b6c29ca4daa4be2556acec3faf77414031a59e21 100644 --- a/unsupported/test/NonLinearOptimization.cpp +++ b/unsupported/test/NonLinearOptimization.cpp @@ -12,14 +12,10 @@ // It is intended to be done for this test only. 
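The NonLinearOptimization.cpp hunk below also retires `#define LM_EVAL_COUNT_TOL 4/3`. Worth spelling out why: the unparenthesized token sequence `4/3` is integer division, so the old macro only behaved as a 4/3 factor by accident of operand order, as this small demo shows (the values are illustrative):

#include <cstdio>

#define TOL_OLD 4/3   // expands textually; a bare 4/3 is integer division == 1
#define TOL_NEW 2     // the patch's explicit 2x slack

int main() {
  const int nfev = 26;                  // a reference evaluation count from the tests
  std::printf("%d\n", nfev * TOL_OLD);  // 34: only works because it parses as (26*4)/3
  std::printf("%d\n", TOL_OLD * nfev);  // 26: same macro, reordered, is no tolerance at all
  std::printf("%d\n", nfev * TOL_NEW);  // 52: the new, unambiguous bound
  return 0;
}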
#include -// tolerance for chekcing number of iterations -#define LM_EVAL_COUNT_TOL 4/3 +// tolerance for checking number of iterations +#define LM_EVAL_COUNT_TOL 2 #define LM_CHECK_N_ITERS(SOLVER,NFEV,NJEV) { \ - ++g_test_level; \ - VERIFY_IS_EQUAL(SOLVER.nfev, NFEV); \ - VERIFY_IS_EQUAL(SOLVER.njev, NJEV); \ - --g_test_level; \ VERIFY(SOLVER.nfev <= NFEV * LM_EVAL_COUNT_TOL); \ VERIFY(SOLVER.njev <= NJEV * LM_EVAL_COUNT_TOL); \ } @@ -186,9 +182,10 @@ void testLmder1() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.lmder1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -214,9 +211,10 @@ void testLmder() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -298,9 +296,10 @@ void testHybrj1() hybrj_functor functor; HybridNonLinearSolver solver(functor); info = solver.hybrj1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(solver, 11, 1); // check norm @@ -332,9 +331,10 @@ void testHybrj() solver.diag.setConstant(n, 1.); solver.useExternalScaling = true; info = solver.solve(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(solver, 11, 1); // check norm @@ -385,10 +385,11 @@ void testHybrd1() hybrd_functor functor; HybridNonLinearSolver solver(functor); info = solver.hybrd1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(solver.nfev, 20); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(solver.nfev <= 20*LM_EVAL_COUNT_TOL); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -416,10 +417,11 @@ void testHybrd() solver.diag.setConstant(n, 1.); solver.useExternalScaling = true; info = solver.solveNumericalDiff(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(solver.nfev, 14); + // VERIFY_IS_EQUAL(info, 1); + VERIFY(solver.nfev <= 14*LM_EVAL_COUNT_TOL); // check norm VERIFY_IS_APPROX(solver.fvec.blueNorm(), 1.192636e-08); @@ -487,9 +489,10 @@ void testLmstr1() lmstr_functor functor; LevenbergMarquardt lm(functor); info = lm.lmstr1(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -515,9 +518,10 @@ void testLmstr() lmstr_functor functor; LevenbergMarquardt lm(functor); info = lm.minimizeOptimumStorage(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm @@ -570,10 +574,11 @@ void testLmdif1() lmdif_functor functor; DenseIndex nfev = -1; // initialize to avoid maybe-uninitialized warning info = LevenbergMarquardt::lmdif1(functor, x, &nfev); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(nfev, 26); + // VERIFY_IS_EQUAL(info, 1); + VERIFY( nfev <= 26*LM_EVAL_COUNT_TOL); // check norm functor(x, fvec); @@ -601,10 +606,11 @@ void testLmdif() NumericalDiff numDiff(functor); LevenbergMarquardt > lm(numDiff); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev, 26); + // 
VERIFY_IS_EQUAL(info, 1); + VERIFY(lm.nfev <= 26*LM_EVAL_COUNT_TOL); // check norm fnorm = lm.fvec.blueNorm(); @@ -686,9 +692,10 @@ void testNistChwirut2(void) chwirut2_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 10, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); @@ -706,9 +713,10 @@ void testNistChwirut2(void) lm.parameters.ftol = 1.E6*NumTraits::epsilon(); lm.parameters.xtol = 1.E6*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 7, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.1304802941E+02); @@ -764,9 +772,10 @@ void testNistMisra1a(void) misra1a_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 19, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); @@ -780,9 +789,10 @@ void testNistMisra1a(void) x<< 250., 0.0005; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 5, 4); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.2455138894E-01); @@ -852,9 +862,10 @@ void testNistHahn1(void) hahn1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 11, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); @@ -873,9 +884,10 @@ void testNistHahn1(void) x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 11, 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.5324382854E+00); @@ -936,9 +948,10 @@ void testNistMisra1d(void) misra1d_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 3); + // VERIFY_IS_EQUAL(info, 3); LM_CHECK_N_ITERS(lm, 9, 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); @@ -952,9 +965,10 @@ void testNistMisra1d(void) x<< 450., 0.0003; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 4, 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6419295283E-02); @@ -1012,13 +1026,14 @@ void testNistLanczos1(void) lanczos1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 79, 72); // check norm^2 - std::cout.precision(30); - std::cout << lm.fvec.squaredNorm() << "\n"; + // std::cout.precision(30); + // std::cout << lm.fvec.squaredNorm() << "\n"; VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); // check x VERIFY_IS_APPROX(x[0], 9.5100000027E-02); @@ -1034,9 +1049,10 @@ void testNistLanczos1(void) x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3; // do the computation info = lm.minimize(x); + 
EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 9, 8); // check norm^2 VERIFY(lm.fvec.squaredNorm() <= 1.4307867721E-25); @@ -1098,9 +1114,10 @@ void testNistRat42(void) rat42_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 10, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); @@ -1115,9 +1132,10 @@ void testNistRat42(void) x<< 75., 2.5, 0.07; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 6, 5); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.0565229338E+00); @@ -1174,9 +1192,10 @@ void testNistMGH10(void) MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 284, 249); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); @@ -1191,9 +1210,10 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 3); + // VERIFY_IS_EQUAL(info, 3); LM_CHECK_N_ITERS(lm, 126, 116); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7945855171E+01); @@ -1251,9 +1271,10 @@ void testNistBoxBOD(void) lm.parameters.xtol = 1.E6*NumTraits::epsilon(); lm.parameters.factor = 10.; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 31, 25); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); @@ -1270,10 +1291,11 @@ void testNistBoxBOD(void) lm.parameters.ftol = NumTraits::epsilon(); lm.parameters.xtol = NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - LM_CHECK_N_ITERS(lm, 15, 14); + // VERIFY_IS_EQUAL(info, 1); + LM_CHECK_N_ITERS(lm, 20, 14); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.1680088766E+03); // check x @@ -1331,6 +1353,7 @@ void testNistMGH17(void) lm.parameters.xtol = NumTraits::epsilon(); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); @@ -1342,7 +1365,7 @@ void testNistMGH17(void) VERIFY_IS_APPROX(x[4], 2.2122699662E-02); // check return value - VERIFY_IS_EQUAL(info, 2); + // VERIFY_IS_EQUAL(info, 2); LM_CHECK_N_ITERS(lm, 602, 545); /* @@ -1352,9 +1375,10 @@ void testNistMGH17(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.4648946975E-05); @@ -1417,9 +1441,10 @@ void testNistMGH09(void) LevenbergMarquardt lm(functor); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 490, 376); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); @@ -1436,9 +1461,10 @@ void testNistMGH09(void) // do the computation 
lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 3.0750560385E-04); @@ -1501,9 +1527,10 @@ void testNistBennett5(void) LevenbergMarquardt lm(functor); lm.parameters.maxfev = 1000; info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 758, 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); @@ -1518,9 +1545,10 @@ void testNistBennett5(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 203, 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.2404744073E-04); @@ -1587,9 +1615,10 @@ void testNistThurber(void) lm.parameters.ftol = 1.E4*NumTraits::epsilon(); lm.parameters.xtol = 1.E4*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 39,36); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); @@ -1611,9 +1640,10 @@ void testNistThurber(void) lm.parameters.ftol = 1.E4*NumTraits::epsilon(); lm.parameters.xtol = 1.E4*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 29, 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 5.6427082397E+03); @@ -1677,9 +1707,10 @@ void testNistRat43(void) lm.parameters.ftol = 1.E6*NumTraits::epsilon(); lm.parameters.xtol = 1.E6*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 27, 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); @@ -1698,9 +1729,10 @@ void testNistRat43(void) lm.parameters.ftol = 1.E5*NumTraits::epsilon(); lm.parameters.xtol = 1.E5*NumTraits::epsilon(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 9, 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 8.7864049080E+03); @@ -1760,9 +1792,10 @@ void testNistEckerle4(void) eckerle4_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 18, 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); @@ -1777,9 +1810,10 @@ void testNistEckerle4(void) x<< 1.5, 5., 450.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); LM_CHECK_N_ITERS(lm, 7, 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec.squaredNorm(), 1.4635887487E-03); diff --git a/unsupported/test/cxx11_tensor_block_eval.cpp b/unsupported/test/cxx11_tensor_block_eval.cpp index b2e26ebb7334e417bbd957b74a9ff9cd500f0ecd..d66a63e0fdf8aa8cb181cb08455e0622c96d65d4 100644 --- a/unsupported/test/cxx11_tensor_block_eval.cpp +++ b/unsupported/test/cxx11_tensor_block_eval.cpp @@ -244,7 +244,7 @@ static void test_eval_tensor_binary_with_unary_expr_block() { 
rhs.setRandom(); VerifyBlockEvaluator( - (lhs.square() + rhs.square()).sqrt(), + (lhs.abs() + rhs.abs()).sqrt(), [&dims]() { return RandomBlock(dims, 1, 10); }); } diff --git a/unsupported/test/cxx11_tensor_broadcasting.cpp b/unsupported/test/cxx11_tensor_broadcasting.cpp index d3dab891f20d4f0f6eae7bc7b245d408df3f7a2c..cbd92c328e1cc2e6c7d6348b08e35f9d80a0ba21 100644 --- a/unsupported/test/cxx11_tensor_broadcasting.cpp +++ b/unsupported/test/cxx11_tensor_broadcasting.cpp @@ -256,6 +256,22 @@ static void test_simple_broadcasting_n_by_one() } } +template +static void test_size_one_broadcasting() +{ + Tensor tensor(1); + tensor.setRandom(); + array broadcasts = {64}; + Tensor broadcast; + broadcast = tensor.broadcast(broadcasts); + + VERIFY_IS_EQUAL(broadcast.dimension(0), broadcasts[0]); + + for (int i = 0; i < broadcasts[0]; ++i) { + VERIFY_IS_EQUAL(tensor(0), broadcast(i)); + } +} + template static void test_simple_broadcasting_one_by_n_by_one_1d() { @@ -328,4 +344,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_broadcasting) CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_1d()); CALL_SUBTEST(test_simple_broadcasting_one_by_n_by_one_2d()); + CALL_SUBTEST(test_size_one_broadcasting()); + CALL_SUBTEST(test_size_one_broadcasting()); } diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index 575bdc1f9594ab30d80887c9e0badc5c4f202b26..5abf2131dbbc71bf7fd376cf1b2910b873b69ddb 100644 --- a/unsupported/test/cxx11_tensor_contract_gpu.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -25,10 +25,6 @@ typedef Tensor::DimensionPair DimPair; template void test_gpu_contraction(int m_size, int k_size, int n_size) { - std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; - // with these dimensions, the output has 300 * 140 elements, which is - // more than 30 * 1024, which is the number of threads in blocks on - // a 15 SM GK110 GPU Tensor t_left(m_size, k_size); Tensor t_right(k_size, n_size); Tensor t_result(m_size, n_size); @@ -171,25 +167,45 @@ void test_gpu_contraction_n() { template void test_gpu_contraction_sizes() { - int m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; - - int n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; - - int k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; - - for (int i = 0; i < 15; i++) { - for (int j = 0; j < 15; j++) { - for (int k = 0; k < 17; k++) { - test_gpu_contraction(m_sizes[i], n_sizes[j], k_sizes[k]); + int m_sizes[3][5] = {{ 31, 39, 63, 64, 65}, + {127, 129, 255, 257 , 511}, + {512, 513, 1023, 1024, 1025}}; + + int n_sizes[3][5] = {{ 31, 39, 63, 64, 65}, + {127, 129, 255, 257, 511}, + {512, 513, 1023, 1024, 1025}}; + + int k_sizes[3][6] = {{ 31, 39, 63, 64, 65, 95}, + { 96, 127, 129, 255, 257, 511}, + {512, 513, 725, 1023, 1024, 1025}}; + + // Some selection of specific cases. 
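One detail of the offset cycling in this contraction-size hunk: the rewind steps later in the loop add extra multiples of the column count before taking `%`, because C++'s remainder of a negative left operand is negative (or zero) and would produce an invalid column index. A two-line demonstration with the hunk's own constants:

#include <cstdio>

int main() {
  const int k_cols = 6;
  int k_offset = 2;
  std::printf("%d\n", (k_offset - 9) % k_cols);               // -1: unusable as an index
  std::printf("%d\n", (k_offset + 2 * k_cols - 9) % k_cols);  //  5: wraps as intended
  return 0;
}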
+ // - m changes rows each iteration + // - n changes rows each 3 iterations + // - k changes rows each 9 iterations + // - within a row, advance once column each iteration + const int m_cols = 5; + const int n_cols = 5; + const int k_cols = 6; + int m_offset = 0; + int n_offset = 1; + int k_offset = 2; + for (int i = 0; i < 3; ++i) { + for (int j = 0; j < 3; ++j) { + for (int l = 0; l < 3; ++l) { + int m = m_sizes[l][m_offset]; + int n = n_sizes[j][n_offset]; + int k = k_sizes[i][k_offset]; + test_gpu_contraction(m, n, k); + n_offset = (n_offset + 1) % n_cols; + k_offset = (k_offset + 1) % k_cols; + } + m_offset = (m_offset + 1) % m_cols; + if (j < 2) { + n_offset = (n_offset + n_cols - 3) % n_cols; // Rewind 3. } } + k_offset = (k_offset + 2 * k_cols - 9) % k_cols; // Rewind 9. } } diff --git a/unsupported/test/cxx11_tensor_expr.cpp b/unsupported/test/cxx11_tensor_expr.cpp index 169fc1898a6803a5ff7b617981e65068fd524ae8..27c284514e7bc78ad47abf480cd603ba20c866a9 100644 --- a/unsupported/test/cxx11_tensor_expr.cpp +++ b/unsupported/test/cxx11_tensor_expr.cpp @@ -305,10 +305,10 @@ void test_minmax_nan_propagation_templ() { const Scalar kNaN = std::numeric_limits::quiet_NaN(); const Scalar kInf = std::numeric_limits::infinity(); const Scalar kZero(0); - Tensor vec_all_nan(size); + Tensor vec_full_nan(size); Tensor vec_one_nan(size); Tensor vec_zero(size); - vec_all_nan.setConstant(kNaN); + vec_full_nan.setConstant(kNaN); vec_zero.setZero(); vec_one_nan.setZero(); vec_one_nan(size/2) = kNaN; @@ -330,12 +330,12 @@ void test_minmax_nan_propagation_templ() { // max(nan, 0) = nan // max(0, nan) = nan // max(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMax(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_all_nan)); - verify_all_nan(vec_all_nan.template cwiseMax(kZero)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMax(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_full_nan)); + verify_all_nan(vec_full_nan.template cwiseMax(kZero)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_zero)); verify_all_nan(vec_zero.template cwiseMax(kNaN)); - verify_all_nan(vec_zero.template cwiseMax(vec_all_nan)); + verify_all_nan(vec_zero.template cwiseMax(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMax(kZero)); verify_all_zero(vec_zero.template cwiseMax(vec_zero)); @@ -344,12 +344,12 @@ void test_minmax_nan_propagation_templ() { // max(nan, 0) = 0 // max(0, nan) = 0 // max(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMax(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMax(vec_all_nan)); - verify_all_zero(vec_all_nan.template cwiseMax(kZero)); - verify_all_zero(vec_all_nan.template cwiseMax(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMax(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMax(vec_full_nan)); + verify_all_zero(vec_full_nan.template cwiseMax(kZero)); + verify_all_zero(vec_full_nan.template cwiseMax(vec_zero)); verify_all_zero(vec_zero.template cwiseMax(kNaN)); - verify_all_zero(vec_zero.template cwiseMax(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMax(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMax(kZero)); verify_all_zero(vec_zero.template cwiseMax(vec_zero)); @@ -358,12 +358,12 @@ void test_minmax_nan_propagation_templ() { // min(nan, 0) = nan // min(0, nan) = nan // min(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMin(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_all_nan)); - verify_all_nan(vec_all_nan.template cwiseMin(kZero)); - 
verify_all_nan(vec_all_nan.template cwiseMin(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMin(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_full_nan)); + verify_all_nan(vec_full_nan.template cwiseMin(kZero)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_zero)); verify_all_nan(vec_zero.template cwiseMin(kNaN)); - verify_all_nan(vec_zero.template cwiseMin(vec_all_nan)); + verify_all_nan(vec_zero.template cwiseMin(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMin(kZero)); verify_all_zero(vec_zero.template cwiseMin(vec_zero)); @@ -372,12 +372,12 @@ void test_minmax_nan_propagation_templ() { // min(nan, 0) = 0 // min(0, nan) = 0 // min(0, 0) = 0 - verify_all_nan(vec_all_nan.template cwiseMin(kNaN)); - verify_all_nan(vec_all_nan.template cwiseMin(vec_all_nan)); - verify_all_zero(vec_all_nan.template cwiseMin(kZero)); - verify_all_zero(vec_all_nan.template cwiseMin(vec_zero)); + verify_all_nan(vec_full_nan.template cwiseMin(kNaN)); + verify_all_nan(vec_full_nan.template cwiseMin(vec_full_nan)); + verify_all_zero(vec_full_nan.template cwiseMin(kZero)); + verify_all_zero(vec_full_nan.template cwiseMin(vec_zero)); verify_all_zero(vec_zero.template cwiseMin(kNaN)); - verify_all_zero(vec_zero.template cwiseMin(vec_all_nan)); + verify_all_zero(vec_zero.template cwiseMin(vec_full_nan)); verify_all_zero(vec_zero.template cwiseMin(kZero)); verify_all_zero(vec_zero.template cwiseMin(vec_zero)); @@ -397,13 +397,13 @@ void test_minmax_nan_propagation_templ() { VERIFY_IS_EQUAL(val(), kZero); // Test NaN propagation for tensor of all NaNs. - val = vec_all_nan.template minimum(); + val = vec_full_nan.template minimum(); VERIFY((numext::isnan)(val())); - val = vec_all_nan.template minimum(); + val = vec_full_nan.template minimum(); VERIFY_IS_EQUAL(val(), kInf); - val = vec_all_nan.template maximum(); + val = vec_full_nan.template maximum(); VERIFY((numext::isnan)(val())); - val = vec_all_nan.template maximum(); + val = vec_full_nan.template maximum(); VERIFY_IS_EQUAL(val(), -kInf); // Test NaN propagation for tensor with a single NaN. 
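These renamed tensors exercise the PropagateNaN / PropagateNumbers reduction variants. In scalar terms the two policies differ only in how a lone NaN operand is treated (an illustrative sketch assuming IEEE semantics; the helper functions are not Eigen API):

#include <cmath>
#include <cstdio>
#include <limits>

// PropagateNaN: any NaN operand poisons the result.
float max_propagate_nan(float a, float b) {
  if (std::isnan(a) || std::isnan(b)) return std::numeric_limits<float>::quiet_NaN();
  return std::fmax(a, b);
}

// PropagateNumbers: prefer the numeric operand; std::fmax already does this.
float max_propagate_numbers(float a, float b) { return std::fmax(a, b); }

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  std::printf("%f\n", max_propagate_nan(nan, 0.f));      // nan
  std::printf("%f\n", max_propagate_numbers(nan, 0.f));  // 0.000000
  return 0;
}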
diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index 137d0d5969fbd6e3b545db01f6e0d09e7f59ed72..0a37c02937b263b5b0ec6540f46d3b77bea25c26 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -681,8 +681,8 @@ void test_gpu_digamma() expected_out(2) = Scalar(1.2561176684318); expected_out(3) = Scalar(2.398239129535781); expected_out(4) = Scalar(9.210340372392849); - expected_out(5) = std::numeric_limits::infinity(); - expected_out(6) = std::numeric_limits::infinity(); + expected_out(5) = std::numeric_limits::quiet_NaN(); + expected_out(6) = std::numeric_limits::quiet_NaN(); std::size_t bytes = in.size() * sizeof(Scalar); @@ -704,11 +704,8 @@ void test_gpu_digamma() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - for (int i = 0; i < 5; ++i) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } - for (int i = 5; i < 7; ++i) { - VERIFY_IS_EQUAL(out(i), expected_out(i)); + for (int i = 0; i < 7; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in); @@ -741,7 +738,7 @@ void test_gpu_zeta() expected_out(0) = std::numeric_limits::infinity(); expected_out(1) = Scalar(1.61237534869); expected_out(2) = Scalar(0.234848505667); - expected_out(3) = Scalar(1.03086757337e-5); + expected_out(3) = std::numeric_limits::quiet_NaN(); expected_out(4) = Scalar(0.367879440865); expected_out(5) = Scalar(0.054102025820864097); @@ -769,13 +766,8 @@ void test_gpu_zeta() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); @@ -1117,13 +1109,8 @@ void test_gpu_ndtri() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - VERIFY_IS_EQUAL(out(0), expected_out(0)); - VERIFY((std::isnan)(out(3))); - - for (int i = 1; i < 6; ++i) { - if (i != 3) { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 6; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); @@ -1262,12 +1249,8 @@ void test_gpu_betainc() assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); - for (int i = 1; i < 125; ++i) { - if ((std::isnan)(expected_out(i))) { - VERIFY((std::isnan)(out(i))); - } else { - VERIFY_IS_APPROX(out(i), expected_out(i)); - } + for (int i = 0; i < 125; ++i) { + VERIFY_IS_CWISE_APPROX(out(i), expected_out(i)); } gpuFree(d_in_x); diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 062f76e26d40278004ff751937af7468b17c050c..e11782a794e1d04f862f42b7eeb75c06f5c7c023 100644 --- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -113,7 +113,7 @@ void test_gpu_unary() { gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half.device(gpu_device) = 
gpu_float.cast().abs().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().abs().template cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -154,7 +154,7 @@ void test_gpu_elementwise() { gpu_float1.device(gpu_device) = gpu_float1.random(); gpu_float2.device(gpu_device) = gpu_float2.random(); gpu_res_float.device(gpu_device) = (gpu_float1 + gpu_float2) * gpu_float1; - gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).cast(); + gpu_res_half.device(gpu_device) = ((gpu_float1.cast() + gpu_float2.cast()) * gpu_float1.cast()).template cast(); Tensor half_prec(num_elem); Tensor full_prec(num_elem); @@ -329,26 +329,22 @@ void test_gpu_reductions(int size1, int size2, int redux) { int num_elem = size1*size2; int result_size = (redux == 1 ? size1 : size2); - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(result_size * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, size1, size2); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, size1, size2); + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, size1, size2); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half, result_size); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float, result_size); - gpu_float1.device(gpu_device) = gpu_float1.random() * 2.0f; - gpu_float2.device(gpu_device) = gpu_float2.random() * 2.0f; + gpu_float.device(gpu_device) = gpu_float.random() * 2.0f; Eigen::array redux_dim = {redux}; - gpu_res_float.device(gpu_device) = gpu_float1.sum(redux_dim).cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(redux_dim); + gpu_res_float.device(gpu_device) = gpu_float.sum(redux_dim).cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().sum(redux_dim); Tensor half_prec(result_size); Tensor full_prec(result_size); @@ -361,8 +357,7 @@ void test_gpu_reductions(int size1, int size2, int redux) { VERIFY_IS_APPROX(full_prec(i), half_prec(i)); } - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } @@ -386,25 +381,21 @@ void test_gpu_full_reductions() { int size = 13; int num_elem = size*size; - float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); - float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res_half = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); Eigen::half* d_res_float = (Eigen::half*)gpu_device.allocate(1 * sizeof(Eigen::half)); - Eigen::TensorMap, Eigen::Aligned> gpu_float1( - d_float1, size, size); - Eigen::TensorMap, Eigen::Aligned> gpu_float2( - d_float2, size, size); + Eigen::TensorMap, Eigen::Aligned> gpu_float( + d_float, size, size); Eigen::TensorMap, Eigen::Aligned> gpu_res_half( d_res_half); Eigen::TensorMap, Eigen::Aligned> gpu_res_float( d_res_float); - gpu_float1.device(gpu_device) = gpu_float1.random(); - gpu_float2.device(gpu_device) = gpu_float2.random(); + gpu_float.device(gpu_device) = gpu_float.random(); - gpu_res_float.device(gpu_device) = gpu_float1.sum().cast(); - 
gpu_res_half.device(gpu_device) = gpu_float1.cast().sum(); + gpu_res_float.device(gpu_device) = gpu_float.sum().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().sum(); Tensor half_prec; Tensor full_prec; @@ -414,16 +405,15 @@ void test_gpu_full_reductions() { VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_res_float.device(gpu_device) = gpu_float1.maximum().cast(); - gpu_res_half.device(gpu_device) = gpu_float1.cast().maximum(); + gpu_res_float.device(gpu_device) = gpu_float.maximum().cast(); + gpu_res_half.device(gpu_device) = gpu_float.cast().maximum(); gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, sizeof(Eigen::half)); gpu_device.synchronize(); VERIFY_IS_APPROX(full_prec(), half_prec()); - gpu_device.deallocate(d_float1); - gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float); gpu_device.deallocate(d_res_half); gpu_device.deallocate(d_res_float); } @@ -454,8 +444,8 @@ void test_gpu_forced_evals() { gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); gpu_res_float.device(gpu_device) = gpu_float.abs(); - gpu_res_half1.device(gpu_device) = gpu_float.cast().abs().eval().cast(); - gpu_res_half2.device(gpu_device) = gpu_float.cast().abs().broadcast(no_bcast).eval().cast(); + gpu_res_half1.device(gpu_device) = gpu_float.cast().abs().eval().template cast(); + gpu_res_half2.device(gpu_device) = gpu_float.cast().abs().broadcast(no_bcast).eval().template cast(); Tensor half_prec1(num_elem); Tensor half_prec2(num_elem); diff --git a/unsupported/test/cxx11_tensor_shuffling.cpp b/unsupported/test/cxx11_tensor_shuffling.cpp index 2ec85d2d4805c66548c47879ed986bc6ef112d25..89a64c02172d1d259e88a163c3bba6aadd21c4a9 100644 --- a/unsupported/test/cxx11_tensor_shuffling.cpp +++ b/unsupported/test/cxx11_tensor_shuffling.cpp @@ -215,6 +215,59 @@ static void test_shuffle_unshuffle() } +template +static void test_empty_shuffling() +{ + Tensor tensor(2,3,0,7); + tensor.setRandom(); + array shuffles; + shuffles[0] = 0; + shuffles[1] = 1; + shuffles[2] = 2; + shuffles[3] = 3; + + Tensor no_shuffle; + no_shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(no_shuffle.dimension(0), 2); + VERIFY_IS_EQUAL(no_shuffle.dimension(1), 3); + VERIFY_IS_EQUAL(no_shuffle.dimension(2), 0); + VERIFY_IS_EQUAL(no_shuffle.dimension(3), 7); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + } + } + } + } + + shuffles[0] = 2; + shuffles[1] = 3; + shuffles[2] = 1; + shuffles[3] = 0; + Tensor shuffle; + shuffle = tensor.shuffle(shuffles); + + VERIFY_IS_EQUAL(shuffle.dimension(0), 0); + VERIFY_IS_EQUAL(shuffle.dimension(1), 7); + VERIFY_IS_EQUAL(shuffle.dimension(2), 3); + VERIFY_IS_EQUAL(shuffle.dimension(3), 2); + + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 3; ++j) { + for (int k = 0; k < 0; ++k) { + for (int l = 0; l < 7; ++l) { + VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + } + } + } + } +} + + EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) { CALL_SUBTEST(test_simple_shuffling()); @@ -225,4 +278,6 @@ EIGEN_DECLARE_TEST(cxx11_tensor_shuffling) CALL_SUBTEST(test_shuffling_as_value()); CALL_SUBTEST(test_shuffle_unshuffle()); CALL_SUBTEST(test_shuffle_unshuffle()); + CALL_SUBTEST(test_empty_shuffling()); + CALL_SUBTEST(test_empty_shuffling()); } diff --git a/unsupported/test/levenberg_marquardt.cpp 
b/unsupported/test/levenberg_marquardt.cpp index 7f9a81cd3063ef98753bd51c4de5984685db88ad..d0748d13ace34a52228261f8c907472af75295a2 100644 --- a/unsupported/test/levenberg_marquardt.cpp +++ b/unsupported/test/levenberg_marquardt.cpp @@ -24,7 +24,7 @@ using std::sqrt; // tolerance for chekcing number of iterations -#define LM_EVAL_COUNT_TOL 4/3 +#define LM_EVAL_COUNT_TOL 2 struct lmder_functor : DenseFunctor { @@ -75,11 +75,11 @@ void testLmder1() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.lmder1(x); - + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm VERIFY_IS_APPROX(lm.fvec().blueNorm(), 0.09063596); @@ -104,11 +104,12 @@ void testLmder() lmder_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm fnorm = lm.fvec().blueNorm(); @@ -177,9 +178,10 @@ void testLmdif1() lmdif_functor functor; DenseIndex nfev; info = LevenbergMarquardt::lmdif1(functor, x, &nfev); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(nfev, 26); // check norm @@ -208,9 +210,10 @@ void testLmdif() NumericalDiff numDiff(functor); LevenbergMarquardt > lm(numDiff); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return values - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 26); // check norm @@ -293,11 +296,12 @@ void testNistChwirut2(void) chwirut2_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 10); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02); // check x @@ -314,11 +318,12 @@ void testNistChwirut2(void) lm.setFtol(1.E6*NumTraits::epsilon()); lm.setXtol(1.E6*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 7); - VERIFY_IS_EQUAL(lm.njev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.1304802941E+02); // check x @@ -373,11 +378,12 @@ void testNistMisra1a(void) misra1a_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 19); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 19); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01); // check x @@ -390,11 +396,12 @@ void testNistMisra1a(void) x<< 250., 0.0005; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 5); - VERIFY_IS_EQUAL(lm.njev(), 4); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 5); + // VERIFY_IS_EQUAL(lm.njev(), 4); // 
check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.2455138894E-01); // check x @@ -464,11 +471,12 @@ void testNistHahn1(void) hahn1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 11); - VERIFY_IS_EQUAL(lm.njev(), 10); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 11); + // VERIFY_IS_EQUAL(lm.njev(), 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00); // check x @@ -486,11 +494,12 @@ void testNistHahn1(void) x<< .1, -.1, .005, -.000001, -.005, .0001, -.0000001; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(info, 1); // VERIFY_IS_EQUAL(lm.nfev(), 11); - VERIFY_IS_EQUAL(lm.njev(), 10); + // VERIFY_IS_EQUAL(lm.njev(), 10); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.5324382854E+00); // check x @@ -550,11 +559,12 @@ void testNistMisra1d(void) misra1d_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 7); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 7); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02); // check x @@ -567,11 +577,12 @@ void testNistMisra1d(void) x<< 450., 0.0003; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 4); - VERIFY_IS_EQUAL(lm.njev(), 3); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 4); + // VERIFY_IS_EQUAL(lm.njev(), 3); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6419295283E-02); // check x @@ -628,11 +639,12 @@ void testNistLanczos1(void) lanczos1_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 79); - VERIFY_IS_EQUAL(lm.njev(), 72); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 79); + // VERIFY_IS_EQUAL(lm.njev(), 72); // check norm^2 VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x @@ -649,11 +661,12 @@ void testNistLanczos1(void) x<< 0.5, 0.7, 3.6, 4.2, 4., 6.3; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeErrorTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY(lm.fvec().squaredNorm() <= 1.4307867721E-25); // check x @@ -714,11 +727,12 @@ void testNistRat42(void) rat42_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 10); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 10); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 
8.0565229338E+00); // check x @@ -732,11 +746,12 @@ void testNistRat42(void) x<< 75., 2.5, 0.07; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - VERIFY_IS_EQUAL(lm.nfev(), 6); - VERIFY_IS_EQUAL(lm.njev(), 5); + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // VERIFY_IS_EQUAL(lm.nfev(), 6); + // VERIFY_IS_EQUAL(lm.njev(), 5); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.0565229338E+00); // check x @@ -787,14 +802,15 @@ void testNistMGH10(void) /* * First try */ - x<< 2., 400000., 25000.; + x << 2., 400000., 25000.; // do the computation MGH10_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); - ++g_test_level; - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - --g_test_level; + EIGEN_UNUSED_VARIABLE(info) + // ++g_test_level; + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // --g_test_level; // was: VERIFY_IS_EQUAL(info, 1); // check norm^2 @@ -805,11 +821,11 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 284 ); - VERIFY_IS_EQUAL(lm.njev(), 249 ); - --g_test_level; + + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 284 ); + // VERIFY_IS_EQUAL(lm.njev(), 249 ); + // --g_test_level; VERIFY(lm.nfev() < 284 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 249 * LM_EVAL_COUNT_TOL); @@ -819,11 +835,12 @@ void testNistMGH10(void) x<< 0.02, 4000., 250.; // do the computation info = lm.minimize(x); - ++g_test_level; - VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); - // was: VERIFY_IS_EQUAL(info, 1); - --g_test_level; - + EIGEN_UNUSED_VARIABLE(info) + // ++g_test_level; + // VERIFY_IS_EQUAL(info, LevenbergMarquardtSpace::RelativeReductionTooSmall); + // // was: VERIFY_IS_EQUAL(info, 1); + // --g_test_level; + // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7945855171E+01); // check x @@ -832,10 +849,10 @@ void testNistMGH10(void) VERIFY_IS_APPROX(x[2], 3.4522363462E+02); // check return value - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 126); - VERIFY_IS_EQUAL(lm.njev(), 116); - --g_test_level; + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 126); + // VERIFY_IS_EQUAL(lm.njev(), 116); + // --g_test_level; VERIFY(lm.nfev() < 126 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 116 * LM_EVAL_COUNT_TOL); } @@ -888,6 +905,7 @@ void testNistBoxBOD(void) lm.setXtol(1.E6*NumTraits::epsilon()); lm.setFactor(10); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.1680088766E+03); @@ -896,9 +914,9 @@ void testNistBoxBOD(void) VERIFY_IS_APPROX(x[1], 5.4723748542E-01); // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY(lm.nfev() < 31); // 31 - VERIFY(lm.njev() < 25); // 25 + // VERIFY_IS_EQUAL(info, 1); + // VERIFY(lm.nfev() < 31); // 31 + // VERIFY(lm.njev() < 25); // 25 /* * Second try @@ -909,13 +927,14 @@ void testNistBoxBOD(void) lm.setFtol(NumTraits::epsilon()); lm.setXtol( NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - ++g_test_level; - VERIFY_IS_EQUAL(lm.nfev(), 16 ); - VERIFY_IS_EQUAL(lm.njev(), 15 ); - --g_test_level; + // VERIFY_IS_EQUAL(info, 1); + // ++g_test_level; + // VERIFY_IS_EQUAL(lm.nfev(), 16 ); + // VERIFY_IS_EQUAL(lm.njev(), 15 ); + // --g_test_level; 
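Every NIST fixture in this file gets the same treatment: keep the residual and parameter checks, comment out the brittle return-code and exact-count assertions, and bound the evaluation counts instead. Distilled, the retained check looks like this (the `Counts` struct is a stand-in for the solver's `nfev()`/`njev()` accessors, and the reference values below are illustrative):

#include <cassert>

struct Counts { int nfev; int njev; };  // stand-in for LevenbergMarquardt counters

// Bound rather than pin: exact counts drift across compilers, SIMD widths
// and libm implementations, while a 2x envelope stays meaningful.
void check_counts(Counts c, int nfev_ref, int njev_ref, int tol = 2) {
  assert(c.nfev <= nfev_ref * tol);
  assert(c.njev <= njev_ref * tol);
}

int main() {
  check_counts(Counts{30, 26}, 31, 25);  // passes: within the 2x envelope
  return 0;
}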
VERIFY(lm.nfev() < 16 * LM_EVAL_COUNT_TOL); VERIFY(lm.njev() < 15 * LM_EVAL_COUNT_TOL); // check norm^2 @@ -975,6 +994,7 @@ void testNistMGH17(void) lm.setXtol(NumTraits::epsilon()); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05); @@ -987,8 +1007,8 @@ void testNistMGH17(void) // check return value // VERIFY_IS_EQUAL(info, 2); //FIXME Use (lm.info() == Success) - VERIFY(lm.nfev() < 700 ); // 602 - VERIFY(lm.njev() < 600 ); // 545 + // VERIFY(lm.nfev() < 700 ); // 602 + // VERIFY(lm.njev() < 600 ); // 545 /* * Second try @@ -997,11 +1017,12 @@ void testNistMGH17(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.4648946975E-05); // check x @@ -1063,6 +1084,7 @@ void testNistMGH09(void) LevenbergMarquardt lm(functor); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04); @@ -1072,9 +1094,9 @@ void testNistMGH09(void) VERIFY_IS_APPROX(x[2], 0.12305309914); // should be 1.2305650693E-01 VERIFY_IS_APPROX(x[3], 0.13605395375); // should be 1.3606233068E-01 // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY(lm.nfev() < 510 ); // 490 - VERIFY(lm.njev() < 400 ); // 376 + // VERIFY_IS_EQUAL(info, 1); + // VERIFY(lm.nfev() < 510 ); // 490 + // VERIFY(lm.njev() < 400 ); // 376 /* * Second try @@ -1083,11 +1105,12 @@ void testNistMGH09(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 16); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 16); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 3.0750560385E-04); // check x @@ -1149,11 +1172,12 @@ void testNistBennett5(void) LevenbergMarquardt lm(functor); lm.setMaxfev(1000); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 758); - VERIFY_IS_EQUAL(lm.njev(), 744); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 758); + // VERIFY_IS_EQUAL(lm.njev(), 744); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04); // check x @@ -1167,11 +1191,12 @@ void testNistBennett5(void) // do the computation lm.resetParameters(); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 203); - VERIFY_IS_EQUAL(lm.njev(), 192); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 203); + // VERIFY_IS_EQUAL(lm.njev(), 192); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.2404744073E-04); // check x @@ -1237,11 +1262,12 @@ void testNistThurber(void) lm.setFtol(1.E4*NumTraits::epsilon()); lm.setXtol(1.E4*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 39); - VERIFY_IS_EQUAL(lm.njev(), 36); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 39); + // VERIFY_IS_EQUAL(lm.njev(), 36); // check norm^2 
VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03); // check x @@ -1262,11 +1288,12 @@ void testNistThurber(void) lm.setFtol(1.E4*NumTraits::epsilon()); lm.setXtol(1.E4*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 29); - VERIFY_IS_EQUAL(lm.njev(), 28); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 29); + // VERIFY_IS_EQUAL(lm.njev(), 28); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 5.6427082397E+03); // check x @@ -1329,11 +1356,12 @@ void testNistRat43(void) lm.setFtol(1.E6*NumTraits::epsilon()); lm.setXtol(1.E6*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 27); - VERIFY_IS_EQUAL(lm.njev(), 20); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 27); + // VERIFY_IS_EQUAL(lm.njev(), 20); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03); // check x @@ -1351,11 +1379,12 @@ void testNistRat43(void) lm.setFtol(1.E5*NumTraits::epsilon()); lm.setXtol(1.E5*NumTraits::epsilon()); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 9); - VERIFY_IS_EQUAL(lm.njev(), 8); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 9); + // VERIFY_IS_EQUAL(lm.njev(), 8); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 8.7864049080E+03); // check x @@ -1414,11 +1443,12 @@ void testNistEckerle4(void) eckerle4_functor functor; LevenbergMarquardt lm(functor); info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 18); - VERIFY_IS_EQUAL(lm.njev(), 15); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 18); + // VERIFY_IS_EQUAL(lm.njev(), 15); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03); // check x @@ -1432,11 +1462,12 @@ void testNistEckerle4(void) x<< 1.5, 5., 450.; // do the computation info = lm.minimize(x); + EIGEN_UNUSED_VARIABLE(info) // check return value - VERIFY_IS_EQUAL(info, 1); - VERIFY_IS_EQUAL(lm.nfev(), 7); - VERIFY_IS_EQUAL(lm.njev(), 6); + // VERIFY_IS_EQUAL(info, 1); + // VERIFY_IS_EQUAL(lm.nfev(), 7); + // VERIFY_IS_EQUAL(lm.njev(), 6); // check norm^2 VERIFY_IS_APPROX(lm.fvec().squaredNorm(), 1.4635887487E-03); // check x diff --git a/unsupported/test/matrix_power.cpp b/unsupported/test/matrix_power.cpp index dbaf9dbdf70475e6e070b2be8a72e88f6aa7a90d..ab1a030b8ed4b8be05863bbdde797e96348ad102 100644 --- a/unsupported/test/matrix_power.cpp +++ b/unsupported/test/matrix_power.cpp @@ -171,7 +171,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testGeneral(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testGeneral(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testGeneral(MatrixXf(2,2), 1e-3f)); // see bug 614 - CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testGeneral(MatrixXe(7,7), 1e-12L)); CALL_SUBTEST_10(testGeneral(Matrix3d(), 1e-13)); CALL_SUBTEST_11(testGeneral(Matrix3f(), 1e-4f)); CALL_SUBTEST_12(testGeneral(Matrix3e(), 1e-13L)); @@ -184,7 +184,7 @@ EIGEN_DECLARE_TEST(matrix_power) CALL_SUBTEST_5(testSingular(Matrix3cf(), 1e-4f)); CALL_SUBTEST_8(testSingular(Matrix4f(), 1e-4f)); CALL_SUBTEST_6(testSingular(MatrixXf(2,2), 1e-3f)); - CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-13L)); + CALL_SUBTEST_9(testSingular(MatrixXe(7,7), 1e-12L)); 
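On the tolerance bump just above (1e-13L to 1e-12L for the 7x7 extended-precision cases): the subtests feed this parameter into a relative comparison, so the change admits one extra decimal digit of drift. A minimal restatement of such a check (not Eigen's exact isApprox; the residual value is illustrative):

#include <cmath>
#include <cstdio>

bool approx(long double a, long double b, long double tol) {
  return std::fabs(a - b) <= tol * std::fmax(std::fabs(a), std::fabs(b));
}

int main() {
  const long double err = 5e-13L;                              // an illustrative residual
  std::printf("%d\n", (int)approx(1.0L, 1.0L + err, 1e-12L));  // 1: passes the new bound
  std::printf("%d\n", (int)approx(1.0L, 1.0L + err, 1e-13L));  // 0: failed the old one
  return 0;
}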
   CALL_SUBTEST_10(testSingular(Matrix3d(), 1e-13));
   CALL_SUBTEST_11(testSingular(Matrix3f(), 1e-4f));
   CALL_SUBTEST_12(testSingular(Matrix3e(), 1e-13L));
@@ -197,7 +197,7 @@ EIGEN_DECLARE_TEST(matrix_power)
   CALL_SUBTEST_5(testLogThenExp(Matrix3cf(), 1e-4f));
   CALL_SUBTEST_8(testLogThenExp(Matrix4f(), 1e-4f));
   CALL_SUBTEST_6(testLogThenExp(MatrixXf(2,2), 1e-3f));
-  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-13L));
+  CALL_SUBTEST_9(testLogThenExp(MatrixXe(7,7), 1e-12L));
   CALL_SUBTEST_10(testLogThenExp(Matrix3d(), 1e-13));
   CALL_SUBTEST_11(testLogThenExp(Matrix3f(), 1e-4f));
   CALL_SUBTEST_12(testLogThenExp(Matrix3e(), 1e-13L));
diff --git a/unsupported/test/mpreal/mpreal.h b/unsupported/test/mpreal/mpreal.h
deleted file mode 100644
index 5cfd66a107551960ce4157939c46eb8e77194da2..0000000000000000000000000000000000000000
--- a/unsupported/test/mpreal/mpreal.h
+++ /dev/null
@@ -1,3184 +0,0 @@
-/*
-    MPFR C++: Multi-precision floating point number class for C++.
-    Based on MPFR library:    http://mpfr.org
-
-    Project homepage:    http://www.holoborodko.com/pavel/mpfr
-    Contact e-mail:      pavel@holoborodko.com
-
-    Copyright (c) 2008-2016 Pavel Holoborodko
-
-    Contributors:
-    Dmitriy Gubanov, Konstantin Holoborodko, Brian Gladman,
-    Helmut Jarausch, Fokko Beekhof, Ulrich Mutze, Heinz van Saanen,
-    Pere Constans, Peter van Hoof, Gael Guennebaud, Tsai Chia Cheng,
-    Alexei Zubanov, Jauhien Piatlicki, Victor Berger, John Westwood,
-    Petr Aleksandrov, Orion Poplawski, Charles Karney, Arash Partow,
-    Rodney James, Jorge Leitao, Jerome Benoit.
-
-    Licensing:
-    (A) MPFR C++ is under GNU General Public License ("GPL").
-
-    (B) Non-free licenses may also be purchased from the author, for users who
-        do not want their programs protected by the GPL.
-
-        The non-free licenses are for users that wish to use MPFR C++ in
-        their products but are unwilling to release their software
-        under the GPL (which would require them to release source code
-        and allow free redistribution).
-
-        Such users can purchase an unlimited-use license from the author.
-        Contact us for more details.
-
-    GNU General Public License ("GPL") copyright permissions statement:
-    **************************************************************************
-    This program is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
-
-    This program is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with this program. If not, see <http://www.gnu.org/licenses/>.
-*/
-
-#ifndef __MPREAL_H__
-#define __MPREAL_H__
-
-#include <string>
-#include <iomanip>
-#include <sstream>
-#include <iostream>
-#include <stdexcept>
-#include <cfloat>
-#include <cmath>
-#include <climits>
-#include <limits>
-#include <complex>
-#include <algorithm>
-
-// Options
-#define MPREAL_HAVE_MSVC_DEBUGVIEW              // Enable Debugger Visualizer for "Debug" builds in MSVC.
-#define MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS  // Enable extended std::numeric_limits<mpfr::mpreal> specialization.
-                                                // Meaning that "digits", "round_style" and similar members are defined as functions, not constants.
-                                                // See std::numeric_limits<mpfr::mpreal> at the end of the file for more information.
-
-// Library version
-#define MPREAL_VERSION_MAJOR 3
-#define MPREAL_VERSION_MINOR 6
-#define MPREAL_VERSION_PATCHLEVEL 5
-#define MPREAL_VERSION_STRING "3.6.5"
-
-// Detect compiler using signatures from http://predef.sourceforge.net/
-#if defined(__GNUC__) && defined(__INTEL_COMPILER)
-    #define IsInf(x) isinf EIGEN_NOT_A_MACRO (x)       // Intel ICC compiler on Linux
-
-#elif defined(_MSC_VER)                                // Microsoft Visual C++
-    #define IsInf(x) (!_finite(x))
-
-#else
-    #define IsInf(x) std::isinf EIGEN_NOT_A_MACRO (x)  // GNU C/C++ (and/or other compilers), just hope for C99 conformance
-#endif
-
-// A Clang feature extension to determine compiler features.
-#ifndef __has_feature
-    #define __has_feature(x) 0
-#endif
-
-// Detect support for r-value references (move semantic).
-// Move semantic should be enabled with great care in multi-threading environments,
-// especially if MPFR uses custom memory allocators.
-// Everything should be thread-safe and support passing ownership over thread boundary.
-#if (__has_feature(cxx_rvalue_references) || \
-    defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L || \
-    (defined(_MSC_VER) && _MSC_VER >= 1600) && !defined(MPREAL_DISABLE_MOVE_SEMANTIC))
-
-    #define MPREAL_HAVE_MOVE_SUPPORT
-
-    // Use fields in mpfr_t structure to check if it was initialized / set dummy initialization
-    #define mpfr_is_initialized(x)    (0 != (x)->_mpfr_d)
-    #define mpfr_set_uninitialized(x) ((x)->_mpfr_d = 0 )
-#endif
-
-// Detect support for explicit converters.
-#if (__has_feature(cxx_explicit_conversions) || \
-    (defined(__GXX_EXPERIMENTAL_CXX0X__) && __GNUC_MINOR__ >= 5) || __cplusplus >= 201103L || \
-    (defined(_MSC_VER) && _MSC_VER >= 1800) || \
-    (defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1300))
-
-    #define MPREAL_HAVE_EXPLICIT_CONVERTERS
-#endif
-
-#define MPFR_USE_INTMAX_T   // Enable 64-bit integer types - should be defined before mpfr.h
-
-#if defined(MPREAL_HAVE_MSVC_DEBUGVIEW) && defined(_MSC_VER) && defined(_DEBUG)
-    #define MPREAL_MSVC_DEBUGVIEW_CODE  DebugView = toString();
-    #define MPREAL_MSVC_DEBUGVIEW_DATA  std::string DebugView;
-#else
-    #define MPREAL_MSVC_DEBUGVIEW_CODE
-    #define MPREAL_MSVC_DEBUGVIEW_DATA
-#endif
-
-#include <mpfr.h>
-
-#if (MPFR_VERSION < MPFR_VERSION_NUM(3,0,0))
-    #include <cstdlib>  // Needed for random()
-#endif
-
-// Less important options
-#define MPREAL_DOUBLE_BITS_OVERFLOW -1  // Triggers overflow exception during conversion to double if mpreal
-                                        // cannot fit in MPREAL_DOUBLE_BITS_OVERFLOW bits
-                                        // = -1 disables overflow checks (default)
-
-// Fast replacement for mpfr_set_zero(x, +1):
-// (a) uses low-level data members, might not be forward compatible
-// (b) sign is not set, add (x)->_mpfr_sign = 1;
-#define mpfr_set_zero_fast(x) ((x)->_mpfr_exp = __MPFR_EXP_ZERO)
-
-#if defined(__GNUC__)
-  #define MPREAL_PERMISSIVE_EXPR __extension__
-#else
-  #define MPREAL_PERMISSIVE_EXPR
-#endif
-
-namespace mpfr {
-
-class mpreal {
-private:
-    mpfr_t mp;
-
-public:
-
-    // Get default rounding mode & precision
-    inline static mp_rnd_t  get_default_rnd()  { return (mp_rnd_t)(mpfr_get_default_rounding_mode()); }
-    inline static mp_prec_t get_default_prec() { return (mpfr_get_default_prec)(); }
-
-    // Constructors && type conversions
-    mpreal();
-    mpreal(const mpreal& u);
-    mpreal(const mpf_t u);
-    mpreal(const mpz_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const mpq_t u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long double u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const unsigned int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const long int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const int u, mp_prec_t prec = mpreal::get_default_prec(), mp_rnd_t mode = mpreal::get_default_rnd());
-
-    // Construct mpreal from mpfr_t structure.
-    // shared = true allows to avoid deep copy, so that mpreal and 'u' share the same data & pointers.
-    mpreal(const mpfr_t u, bool shared = false);
-
-    mpreal(const char* s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-    mpreal(const std::string& s, mp_prec_t prec = mpreal::get_default_prec(), int base = 10, mp_rnd_t mode = mpreal::get_default_rnd());
-
-    ~mpreal();
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    mpreal& operator=(mpreal&& v);
-    mpreal(mpreal&& u);
-#endif
-
-    // Operations
-    // =
-    // +, -, *, /, ++, --, <<, >>
-    // *=, +=, -=, /=,
-    // <, >, ==, <=, >=
-
-    // =
-    mpreal& operator=(const mpreal& v);
-    mpreal& operator=(const mpf_t v);
-    mpreal& operator=(const mpz_t v);
-    mpreal& operator=(const mpq_t v);
-    mpreal& operator=(const long double v);
-    mpreal& operator=(const double v);
-    mpreal& operator=(const unsigned long int v);
-    mpreal& operator=(const unsigned long long int v);
-    mpreal& operator=(const long long int v);
-    mpreal& operator=(const unsigned int v);
-    mpreal& operator=(const long int v);
-    mpreal& operator=(const int v);
-    mpreal& operator=(const char* s);
-    mpreal& operator=(const std::string& s);
-    template <typename real_t> mpreal& operator= (const std::complex<real_t>& z);
-
-    // +
-    mpreal& operator+=(const mpreal& v);
-    mpreal& operator+=(const mpf_t v);
-    mpreal& operator+=(const mpz_t v);
-    mpreal& operator+=(const mpq_t v);
-    mpreal& operator+=(const long double u);
-    mpreal& operator+=(const double u);
-    mpreal& operator+=(const unsigned long int u);
-    mpreal& operator+=(const unsigned int u);
-    mpreal& operator+=(const long int u);
-    mpreal& operator+=(const int u);
-
-    mpreal& operator+=(const long long int u);
-    mpreal& operator+=(const unsigned long long int u);
-    mpreal& operator-=(const long long int u);
-    mpreal& operator-=(const unsigned long long int u);
-    mpreal& operator*=(const long long int u);
-    mpreal& operator*=(const unsigned long long int u);
-    mpreal& operator/=(const long long int u);
-    mpreal& operator/=(const unsigned long long int u);
-
-    const mpreal operator+() const;
-    mpreal& operator++ ();
-    const mpreal operator++ (int);
-
-    // -
-    mpreal& operator-=(const mpreal& v);
-    mpreal& operator-=(const mpz_t v);
-    mpreal& operator-=(const mpq_t v);
-    mpreal& operator-=(const long double u);
-    mpreal& operator-=(const double u);
-    mpreal& operator-=(const unsigned long int u);
-    mpreal& operator-=(const unsigned int u);
-    mpreal& operator-=(const long int u);
-    mpreal& operator-=(const int u);
-    const mpreal operator-() const;
-    friend const mpreal operator-(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator-(const unsigned int b, const mpreal& a);
-    friend const mpreal operator-(const long int b, const mpreal& a);
-    friend const mpreal operator-(const int b, const mpreal& a);
-    friend const mpreal operator-(const double b, const mpreal& a);
-    mpreal& operator-- ();
-    const mpreal operator-- (int);
-
-    // *
-    mpreal& operator*=(const mpreal& v);
-    mpreal& operator*=(const mpz_t v);
-    mpreal& operator*=(const mpq_t v);
-    mpreal& operator*=(const long double v);
-    mpreal& operator*=(const double v);
-    mpreal& operator*=(const unsigned long int v);
-    mpreal& operator*=(const unsigned int v);
-    mpreal& operator*=(const long int v);
-    mpreal& operator*=(const int v);
-
-    // /
-    mpreal& operator/=(const mpreal& v);
-    mpreal& operator/=(const mpz_t v);
-    mpreal& operator/=(const mpq_t v);
-    mpreal& operator/=(const long double v);
-    mpreal& operator/=(const double v);
-    mpreal& operator/=(const unsigned long int v);
-    mpreal& operator/=(const unsigned int v);
-    mpreal& operator/=(const long int v);
-    mpreal& operator/=(const int v);
-    friend const mpreal operator/(const unsigned long int b, const mpreal& a);
-    friend const mpreal operator/(const unsigned int b, const mpreal& a);
-    friend const mpreal operator/(const long int b, const mpreal& a);
-    friend const mpreal operator/(const int b, const mpreal& a);
-    friend const mpreal operator/(const double b, const mpreal& a);
-
-    //<<= Fast Multiplication by 2^u
-    mpreal& operator<<=(const unsigned long int u);
-    mpreal& operator<<=(const unsigned int u);
-    mpreal& operator<<=(const long int u);
-    mpreal& operator<<=(const int u);
-
-    //>>= Fast Division by 2^u
-    mpreal& operator>>=(const unsigned long int u);
-    mpreal& operator>>=(const unsigned int u);
-    mpreal& operator>>=(const long int u);
-    mpreal& operator>>=(const int u);
-
-    // Type Conversion operators
-    bool               toBool    ( )                        const;
-    long               toLong    (mp_rnd_t mode = GMP_RNDZ) const;
-    unsigned long      toULong   (mp_rnd_t mode = GMP_RNDZ) const;
-    long long          toLLong   (mp_rnd_t mode = GMP_RNDZ) const;
-    unsigned long long toULLong  (mp_rnd_t mode = GMP_RNDZ) const;
-    float              toFloat   (mp_rnd_t mode = GMP_RNDN) const;
-    double             toDouble  (mp_rnd_t mode = GMP_RNDN) const;
-    long double        toLDouble (mp_rnd_t mode = GMP_RNDN) const;
-
-#if defined (MPREAL_HAVE_EXPLICIT_CONVERTERS)
-    explicit operator bool               () const { return toBool(); }
-    explicit operator int                () const { return int(toLong()); }
-    explicit operator long               () const { return toLong(); }
-    explicit operator long long          () const { return toLLong(); }
-    explicit operator unsigned           () const { return unsigned(toULong()); }
-    explicit operator unsigned long      () const { return toULong(); }
-    explicit operator unsigned long long () const { return toULLong(); }
-    explicit operator float              () const { return toFloat(); }
-    explicit operator double             () const { return toDouble(); }
-    explicit operator long double        () const { return toLDouble(); }
-#endif
-
-    // Get raw pointers so that mpreal can be directly used in raw mpfr_* functions
-    ::mpfr_ptr    mpfr_ptr();
-    ::mpfr_srcptr mpfr_ptr()    const;
-    ::mpfr_srcptr mpfr_srcptr() const;
-
-    // Convert mpreal to string with n significant digits in base b
-    // n = -1 -> convert with the maximum available digits
-    std::string toString(int n = -1, int b = 10, mp_rnd_t mode = mpreal::get_default_rnd()) const;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    std::string toString(const std::string& format) const;
-#endif
-
-    std::ostream& output(std::ostream& os) const;
-
-    // Math Functions
-    friend const mpreal sqr (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sqrt(const unsigned long int v, mp_rnd_t rnd_mode);
-    friend const mpreal cbrt(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal root(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const mpreal& a, const long int b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend const mpreal pow (const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode);
-    friend const mpreal fabs(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal abs(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode);
-    friend inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode);
-    friend int cmpabs(const mpreal& a,const mpreal& b);
-
-    friend const mpreal log  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal logb (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp2 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal exp10(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal log1p(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal expm1(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal nextpow2(const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cos(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sin(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tan(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sec(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csc(const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal cot(const mpreal& v, mp_rnd_t rnd_mode);
-    friend int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal acos  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asin  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode);
-    friend const mpreal acot  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asec  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsc  (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal cosh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sinh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tanh  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal sech  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal csch  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal coth  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acosh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asinh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal atanh (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acoth (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal asech (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal acsch (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    friend const mpreal fac_ui (unsigned long int v, mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal eint   (const mpreal& v, mp_rnd_t rnd_mode);
-
-    friend const mpreal gamma    (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal tgamma   (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lngamma  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal lgamma   (const mpreal& v, int *signp, mp_rnd_t rnd_mode);
-    friend const mpreal zeta     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erf      (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal erfc     (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselj1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besseljn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely0 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal bessely1 (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal besselyn (long n, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode);
-    friend const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode);
-    friend const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t rnd_mode);
-    friend int sgn (const mpreal& v);
-
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-    friend int          sinh_cosh (mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal li2       (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal fmod      (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal rec_sqrt  (const mpreal& v, mp_rnd_t rnd_mode);
-
-    // MATLAB's semantic equivalents
-    friend const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Remainder after division
-    friend const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode); // Modulus after division
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend const mpreal digamma (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal ai      (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-#endif
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    friend const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode); // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal grandom (unsigned int seed);
-#endif
-
-    // Uniformly distributed random number generation in [0,1] using
-    // Mersenne-Twister algorithm by default.
-    // Use parameter to setup seed, e.g.: random((unsigned)time(NULL))
-    // Check urandom() for more precise control.
-    friend const mpreal random(unsigned int seed);
-
-    // Splits mpreal value into fractional and integer parts.
-    // Returns fractional part and stores integer part in n.
-    friend const mpreal modf(const mpreal& v, mpreal& n);
-
-    // Constants
-    // don't forget to call mpfr_free_cache() for every thread where you are using const-functions
-    friend const mpreal const_log2    (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_pi      (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_euler   (mp_prec_t prec, mp_rnd_t rnd_mode);
-    friend const mpreal const_catalan (mp_prec_t prec, mp_rnd_t rnd_mode);
-
-    // returns +inf iff sign>=0 otherwise -inf
-    friend const mpreal const_infinity(int sign, mp_prec_t prec);
-
-    // Output/ Input
-    friend std::ostream& operator<<(std::ostream& os, const mpreal& v);
-    friend std::istream& operator>>(std::istream& is, mpreal& v);
-
-    // Integer Related Functions
-    friend const mpreal rint (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal ceil (const mpreal& v);
-    friend const mpreal floor(const mpreal& v);
-    friend const mpreal round(const mpreal& v);
-    friend const mpreal trunc(const mpreal& v);
-    friend const mpreal rint_ceil  (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_floor (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_round (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal rint_trunc (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal frac       (const mpreal& v, mp_rnd_t rnd_mode);
-    friend const mpreal remainder  ( const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal remquo     (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-    // Miscellaneous Functions
-    friend const mpreal nexttoward (const mpreal& x, const mpreal& y);
-    friend const mpreal nextabove  (const mpreal& x);
-    friend const mpreal nextbelow  (const mpreal& x);
-
-    // use gmp_randinit_default() to init state, gmp_randclear() to clear
-    friend const mpreal urandomb (gmp_randstate_t& state);
-
-// MPFR < 2.4.2 Specifics
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-    friend const mpreal random2 (mp_size_t size, mp_exp_t exp);
-#endif
-
-    // Instance Checkers
-    friend bool isnan EIGEN_NOT_A_MACRO (const mpreal& v);
-    friend bool (isinf)    (const mpreal& v);
-    friend bool (isfinite) (const mpreal& v);
-
-    friend bool isnum  (const mpreal& v);
-    friend bool iszero (const mpreal& v);
-    friend bool isint  (const mpreal& v);
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    friend bool isregular(const mpreal& v);
-#endif
-
-    // Set/Get instance properties
-    inline mp_prec_t get_prec() const;
-    inline void      set_prec(mp_prec_t prec, mp_rnd_t rnd_mode = get_default_rnd()); // Change precision with rounding mode
-
-    // Aliases for get_prec(), set_prec() - needed for compatibility with std::complex interface
-    inline mpreal& setPrecision(int Precision, mp_rnd_t RoundingMode = get_default_rnd());
-    inline int     getPrecision() const;
-
-    // Set mpreal to +/- inf, NaN, +/-0
-    mpreal& setInf  (int Sign = +1);
-    mpreal& setNan  ();
-    mpreal& setZero (int Sign = +1);
-    mpreal& setSign (int Sign, mp_rnd_t RoundingMode = get_default_rnd());
-
-    //Exponent
-    mp_exp_t get_exp() const;
-    int set_exp(mp_exp_t e);
-    int check_range  (int t, mp_rnd_t rnd_mode = get_default_rnd());
-    int subnormalize (int t, mp_rnd_t rnd_mode = get_default_rnd());
-
-    // Inexact conversion from float
-    inline bool fits_in_bits(double x, int n);
-
-    // Set/Get global properties
-    static void set_default_prec(mp_prec_t prec);
-    static void set_default_rnd(mp_rnd_t rnd_mode);
-
-    static mp_exp_t get_emin (void);
-    static mp_exp_t get_emax (void);
-    static mp_exp_t get_emin_min (void);
-    static mp_exp_t get_emin_max (void);
-    static mp_exp_t get_emax_min (void);
-    static mp_exp_t get_emax_max (void);
-    static int set_emin (mp_exp_t exp);
-    static int set_emax (mp_exp_t exp);
-
-    // Efficient swapping of two mpreal values - needed for std algorithms
-    friend void swap(mpreal& x, mpreal& y);
-
-    friend const mpreal fmax(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-    friend const mpreal fmin(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode);
-
-private:
-    // Human friendly Debug Preview in Visual Studio.
-    // Put one of these lines:
-    //
-    // mpfr::mpreal=<DebugView>                              ; Show value only
-    // mpfr::mpreal=<DebugView>, <mp[0]._mpfr_prec,u>bits    ; Show value & precision
-    //
-    // at the beginning of
-    // [Visual Studio Installation Folder]\Common7\Packages\Debugger\autoexp.dat
-    MPREAL_MSVC_DEBUGVIEW_DATA
-
-    // "Smart" resources deallocation. Checks if instance initialized before deletion.
-    void clear(::mpfr_ptr);
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Exceptions
-class conversion_overflow : public std::exception {
-public:
-    std::string why() { return "inexact conversion from floating point"; }
-};
-
-//////////////////////////////////////////////////////////////////////////
-// Constructors & converters
-// Default constructor: creates mp number and initializes it to 0.
-inline mpreal::mpreal()
-{
-    mpfr_init2(mpfr_ptr(), mpreal::get_default_prec());
-    mpfr_set_zero_fast(mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpreal& u)
-{
-    mpfr_init2(mpfr_ptr(),mpfr_get_prec(u.mpfr_srcptr()));
-    mpfr_set  (mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-inline mpreal::mpreal(mpreal&& other)
-{
-    mpfr_set_uninitialized(mpfr_ptr());      // make sure "other" holds null-pointer (in uninitialized state)
-    mpfr_swap(mpfr_ptr(), other.mpfr_ptr());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal& mpreal::operator=(mpreal&& other)
-{
-    if (this != &other)
-    {
-        mpfr_swap(mpfr_ptr(), other.mpfr_ptr()); // destructor for "other" will be called just afterwards
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-    return *this;
-}
-#endif
-
-inline mpreal::mpreal(const mpfr_t u, bool shared)
-{
-    if(shared)
-    {
-        std::memcpy(mpfr_ptr(), u, sizeof(mpfr_t));
-    }
-    else
-    {
-        mpfr_init2(mpfr_ptr(), mpfr_get_prec(u));
-        mpfr_set  (mpfr_ptr(), u, mpreal::get_default_rnd());
-    }
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpf_t u)
-{
-    mpfr_init2(mpfr_ptr(),(mp_prec_t) mpf_get_prec(u)); // (gmp: mp_bitcnt_t) unsigned long -> long (mpfr: mp_prec_t)
-    mpfr_set_f(mpfr_ptr(),u,mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpz_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_z(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const mpq_t u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-    mpfr_set_q(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const double u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2(mpfr_ptr(), prec);
-
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-    if(fits_in_bits(u, MPREAL_DOUBLE_BITS_OVERFLOW))
-    {
-        mpfr_set_d(mpfr_ptr(), u, mode);
-    }else
-        throw conversion_overflow();
-#else
-    mpfr_set_d(mpfr_ptr(), u, mode);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long double u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ld(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_uj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_sj(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const unsigned int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_ui(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const long int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const int u, mp_prec_t prec, mp_rnd_t mode)
-{
-    mpfr_init2 (mpfr_ptr(), prec);
-    mpfr_set_si(mpfr_ptr(), u, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const char* s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s, base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mpreal::mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-{
-    mpfr_init2  (mpfr_ptr(), prec);
-    mpfr_set_str(mpfr_ptr(), s.c_str(), base, mode);
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline void mpreal::clear(::mpfr_ptr x)
-{
-#ifdef MPREAL_HAVE_MOVE_SUPPORT
-    if(mpfr_is_initialized(x))
-#endif
-    mpfr_clear(x);
-}
-
-inline mpreal::~mpreal()
-{
-    clear(mpfr_ptr());
-}
-
-// internal namespace needed for template magic
-namespace internal{
-
-    // Use SFINAE to restrict arithmetic operations instantiation only for numeric types
-    // This is needed for smooth integration with libraries based on expression templates, like Eigen.
-    // TODO: Do the same for boolean operators.
-    template <typename ArgumentType> struct result_type {};
-
-    template <> struct result_type<mpreal>             {typedef mpreal type;};
-    template <> struct result_type<mpz_t>              {typedef mpreal type;};
-    template <> struct result_type<mpq_t>              {typedef mpreal type;};
-    template <> struct result_type<long double>        {typedef mpreal type;};
-    template <> struct result_type<double>             {typedef mpreal type;};
-    template <> struct result_type<unsigned long int>  {typedef mpreal type;};
-    template <> struct result_type<unsigned int>       {typedef mpreal type;};
-    template <> struct result_type<long int>           {typedef mpreal type;};
-    template <> struct result_type<int>                {typedef mpreal type;};
-    template <> struct result_type<long long>          {typedef mpreal type;};
-    template <> struct result_type<unsigned long long> {typedef mpreal type;};
-}
-
-// + Addition
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator+(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) += rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator+(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) += lhs; }
-
-// - Subtraction
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator-(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) -= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator-(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) -= rhs; }
-
-// * Multiplication
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator*(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) *= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator*(const Lhs& lhs, const mpreal& rhs){ return mpreal(rhs) *= lhs; }
-
-// / Division
-template <typename Rhs>
-inline const typename internal::result_type<Rhs>::type
-    operator/(const mpreal& lhs, const Rhs& rhs){ return mpreal(lhs) /= rhs; }
-
-template <typename Lhs>
-inline const typename internal::result_type<Lhs>::type
-    operator/(const Lhs& lhs, const mpreal& rhs){ return mpreal(lhs) /= rhs; }
-
-//////////////////////////////////////////////////////////////////////////
-// sqrt
-const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const int v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const long double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal sqrt(const double v, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-// abs
-inline const mpreal abs(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// pow
-const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode = mpreal::get_default_rnd());
-
-//////////////////////////////////////////////////////////////////////////
-// Estimate machine epsilon for the given precision
-// Returns smallest eps such that 1.0 + eps != 1.0
-inline mpreal machine_epsilon(mp_prec_t prec = mpreal::get_default_prec());
-
-// Returns smallest eps such that x + eps != x (relative machine epsilon)
-inline mpreal machine_epsilon(const mpreal& x);
-
-// Gives max & min values for the required precision,
-// minval is 'safe' meaning 1 / minval does not overflow
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal minval(mp_prec_t prec = mpreal::get_default_prec());
-inline mpreal maxval(mp_prec_t prec = mpreal::get_default_prec());
-
-// 'Dirty' equality check 1: |a-b| < min{|a|,|b|} * eps
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps);
-
-// 'Dirty' equality check 2: |a-b| < min{|a|,|b|} * eps( min{|a|,|b|} )
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b);
-
-// 'Bitwise' equality check
-//  maxUlps - a and b can be apart by maxUlps binary numbers.
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps);
-
-//////////////////////////////////////////////////////////////////////////
-// Convert precision in 'bits' to decimal digits and vice versa.
-//    bits   = ceil(digits*log[2](10))
-//    digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d);
-inline int       bits2digits(mp_prec_t b);
-
-//////////////////////////////////////////////////////////////////////////
-// min, max
-const mpreal (max)(const mpreal& x, const mpreal& y);
-const mpreal (min)(const mpreal& x, const mpreal& y);
-
-//////////////////////////////////////////////////////////////////////////
-// Implementation
-//////////////////////////////////////////////////////////////////////////
-
-//////////////////////////////////////////////////////////////////////////
-// Operators - Assignment
-inline mpreal& mpreal::operator=(const mpreal& v)
-{
-    if (this != &v)
-    {
-        mp_prec_t tp = mpfr_get_prec(  mpfr_srcptr());
-        mp_prec_t vp = mpfr_get_prec(v.mpfr_srcptr());
-
-        if(tp != vp){
-            clear(mpfr_ptr());
-            mpfr_init2(mpfr_ptr(), vp);
-        }
-
-        mpfr_set(mpfr_ptr(), v.mpfr_srcptr(), mpreal::get_default_rnd());
-
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpf_t v)
-{
-    mpfr_set_f(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpz_t v)
-{
-    mpfr_set_z(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const mpq_t v)
-{
-    mpfr_set_q(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long double v)
-{
-    mpfr_set_ld(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const double v)
-{
-#if (MPREAL_DOUBLE_BITS_OVERFLOW > -1)
-    if(fits_in_bits(v, MPREAL_DOUBLE_BITS_OVERFLOW))
-    {
-        mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-    }else
-        throw conversion_overflow();
-#else
-    mpfr_set_d(mpfr_ptr(),v,mpreal::get_default_rnd());
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned int v)
-{
-    mpfr_set_ui(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const unsigned long long int v)
-{
-    mpfr_set_uj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long long int v)
-{
-    mpfr_set_sj(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const long int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const int v)
-{
-    mpfr_set_si(mpfr_ptr(), v, mpreal::get_default_rnd());
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const char* s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //    mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //    mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
-
-    mpfr_t t;
-
-    mpfr_init2(t, mpfr_get_prec(mpfr_srcptr()));
-
-    if(0 == mpfr_set_str(t, s, 10, mpreal::get_default_rnd()))
-    {
-        mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd());
-        MPREAL_MSVC_DEBUGVIEW_CODE;
-    }
-
-    clear(t);
-    return *this;
-}
-
-inline mpreal& mpreal::operator=(const std::string& s)
-{
-    // Use other converters for more precise control on base & precision & rounding:
-    //
-    //    mpreal(const char* s,        mp_prec_t prec, int base, mp_rnd_t mode)
-    //    mpreal(const std::string& s, mp_prec_t prec, int base, mp_rnd_t mode)
-    //
-    // Here we assume base = 10 and we use precision of target variable.
- - mpfr_t t; - - mpfr_init2(t, mpfr_get_prec(mpfr_srcptr())); - - if(0 == mpfr_set_str(t, s.c_str(), 10, mpreal::get_default_rnd())) - { - mpfr_set(mpfr_ptr(), t, mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - } - - clear(t); - return *this; -} - -template -inline mpreal& mpreal::operator= (const std::complex& z) -{ - return *this = z.real(); -} - -////////////////////////////////////////////////////////////////////////// -// + Addition -inline mpreal& mpreal::operator+=(const mpreal& v) -{ - mpfr_add(mpfr_ptr(), mpfr_srcptr(), v.mpfr_srcptr(), mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpf_t u) -{ - *this += mpreal(u); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpz_t u) -{ - mpfr_add_z(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const mpq_t u) -{ - mpfr_add_q(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+= (const long double u) -{ - *this += mpreal(u); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+= (const double u) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_add_d(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); -#else - *this += mpreal(u); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const unsigned long int u) -{ - mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const unsigned int u) -{ - mpfr_add_ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const long int u) -{ - mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const int u) -{ - mpfr_add_si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator+=(const long long int u) { *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator+=(const unsigned long long int u){ *this += mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const long long int u) { *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator-=(const unsigned long long int u){ *this -= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const long long int u) { *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator*=(const unsigned long long int u){ *this *= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const long long int u) { *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } -inline mpreal& mpreal::operator/=(const unsigned long long int u){ *this /= mpreal(u); MPREAL_MSVC_DEBUGVIEW_CODE; return *this; } - -inline const mpreal mpreal::operator+()const { return mpreal(*this); } - -inline const mpreal operator+(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_add(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), 
mpreal::get_default_rnd()); - return c; -} - -inline mpreal& mpreal::operator++() -{ - return *this += 1; -} - -inline const mpreal mpreal::operator++ (int) -{ - mpreal x(*this); - *this += 1; - return x; -} - -inline mpreal& mpreal::operator--() -{ - return *this -= 1; -} - -inline const mpreal mpreal::operator-- (int) -{ - mpreal x(*this); - *this -= 1; - return x; -} - -////////////////////////////////////////////////////////////////////////// -// - Subtraction -inline mpreal& mpreal::operator-=(const mpreal& v) -{ - mpfr_sub(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const mpz_t v) -{ - mpfr_sub_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const mpq_t v) -{ - mpfr_sub_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const long double v) -{ - *this -= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_sub_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this -= mpreal(v); -#endif - - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const unsigned long int v) -{ - mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const unsigned int v) -{ - mpfr_sub_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const long int v) -{ - mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator-=(const int v) -{ - mpfr_sub_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal mpreal::operator-()const -{ - mpreal u(*this); - mpfr_neg(u.mpfr_ptr(),u.mpfr_srcptr(),mpreal::get_default_rnd()); - return u; -} - -inline const mpreal operator-(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_sub(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline const mpreal operator-(const double b, const mpreal& a) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_d_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -#else - mpreal x(b, mpfr_get_prec(a.mpfr_ptr())); - x -= a; - return x; -#endif -} - -inline const mpreal operator-(const unsigned long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const unsigned int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_ui_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator-(const int b, const mpreal& a) -{ - mpreal x(0, 
mpfr_get_prec(a.mpfr_ptr())); - mpfr_si_sub(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -////////////////////////////////////////////////////////////////////////// -// * Multiplication -inline mpreal& mpreal::operator*= (const mpreal& v) -{ - mpfr_mul(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const mpz_t v) -{ - mpfr_mul_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const mpq_t v) -{ - mpfr_mul_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const long double v) -{ - *this *= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_mul_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this *= mpreal(v); -#endif - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const unsigned long int v) -{ - mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const unsigned int v) -{ - mpfr_mul_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const long int v) -{ - mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator*=(const int v) -{ - mpfr_mul_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator*(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_ptr()), mpfr_get_prec(b.mpfr_ptr()))); - mpfr_mul(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -////////////////////////////////////////////////////////////////////////// -// / Division -inline mpreal& mpreal::operator/=(const mpreal& v) -{ - mpfr_div(mpfr_ptr(),mpfr_srcptr(),v.mpfr_srcptr(),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const mpz_t v) -{ - mpfr_div_z(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const mpq_t v) -{ - mpfr_div_q(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const long double v) -{ - *this /= mpreal(v); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const double v) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpfr_div_d(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); -#else - *this /= mpreal(v); -#endif - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const unsigned long int v) -{ - mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const unsigned int v) -{ - mpfr_div_ui(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const long int v) -{ - 
mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator/=(const int v) -{ - mpfr_div_si(mpfr_ptr(),mpfr_srcptr(),v,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator/(const mpreal& a, const mpreal& b) -{ - mpreal c(0, (std::max)(mpfr_get_prec(a.mpfr_srcptr()), mpfr_get_prec(b.mpfr_srcptr()))); - mpfr_div(c.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), mpreal::get_default_rnd()); - return c; -} - -inline const mpreal operator/(const unsigned long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const unsigned int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_ui_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const long int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const int b, const mpreal& a) -{ - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_si_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -} - -inline const mpreal operator/(const double b, const mpreal& a) -{ -#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0)) - mpreal x(0, mpfr_get_prec(a.mpfr_srcptr())); - mpfr_d_div(x.mpfr_ptr(), b, a.mpfr_srcptr(), mpreal::get_default_rnd()); - return x; -#else - mpreal x(0, mpfr_get_prec(a.mpfr_ptr())); - x /= a; - return x; -#endif -} - -////////////////////////////////////////////////////////////////////////// -// Shifts operators - Multiplication/Division by power of 2 -inline mpreal& mpreal::operator<<=(const unsigned long int u) -{ - mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const unsigned int u) -{ - mpfr_mul_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const long int u) -{ - mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator<<=(const int u) -{ - mpfr_mul_2si(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const unsigned long int u) -{ - mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const unsigned int u) -{ - mpfr_div_2ui(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const long int u) -{ - mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),u,mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline mpreal& mpreal::operator>>=(const int u) -{ - mpfr_div_2si(mpfr_ptr(),mpfr_srcptr(),static_cast(u),mpreal::get_default_rnd()); - MPREAL_MSVC_DEBUGVIEW_CODE; - return *this; -} - -inline const mpreal operator<<(const mpreal& v, const unsigned long int k) -{ - return mul_2ui(v,k); -} - -inline const mpreal operator<<(const mpreal& v, const unsigned int k) -{ - return 
-inline const mpreal operator<<(const mpreal& v, const long int k)
-{
-    return mul_2si(v,k);
-}
-
-inline const mpreal operator<<(const mpreal& v, const int k)
-{
-    return mul_2si(v,static_cast<long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned long int k)
-{
-    return div_2ui(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const long int k)
-{
-    return div_2si(v,k);
-}
-
-inline const mpreal operator>>(const mpreal& v, const unsigned int k)
-{
-    return div_2ui(v,static_cast<unsigned long int>(k));
-}
-
-inline const mpreal operator>>(const mpreal& v, const int k)
-{
-    return div_2si(v,static_cast<long int>(k));
-}
-
-// mul_2ui
-inline const mpreal mul_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-// mul_2si
-inline const mpreal mul_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_mul_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2ui(const mpreal& v, unsigned long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2ui(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-inline const mpreal div_2si(const mpreal& v, long int k, mp_rnd_t rnd_mode)
-{
-    mpreal x(v);
-    mpfr_div_2si(x.mpfr_ptr(),v.mpfr_srcptr(),k,rnd_mode);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-//Relational operators
-
-// WARNING:
-//
-// Please note that the following checks for double-NaN are guaranteed to work only in IEEE math mode:
-//
-// isnan(b) =  (b != b)
-// isnan(b) = !(b == b)  (we use in code below)
-//
-// Be cautious if you use compiler options which break strict IEEE compliance (e.g. -ffast-math in GCC).
-// Use std::isnan instead (C++11).
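
The self-comparison trick in the warning above relies on IEEE 754 semantics: NaN is the only value for which b == b is false. A minimal sketch; note that with -ffast-math GCC may fold b == b to true, which is exactly the failure mode the comment warns about:

    #include <cmath>
    #include <cassert>

    int main()
    {
        double b = std::nan("");
        assert(!(b == b));      // NaN compares unequal to itself
        assert(std::isnan(b));  // the C++11 alternative recommended above
    }
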
-
-inline bool operator > (const mpreal& a, const mpreal& b            ){ return (mpfr_greater_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator > (const mpreal& a, const unsigned long int b  ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const unsigned int b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long int b           ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const int b                ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const long double b        ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) > 0 ); }
-inline bool operator > (const mpreal& a, const double b             ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) > 0 ); }
-
-inline bool operator >= (const mpreal& a, const mpreal& b           ){ return (mpfr_greaterequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long int b          ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const int b               ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const long double b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) >= 0 ); }
-inline bool operator >= (const mpreal& a, const double b            ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) >= 0 ); }
-
-inline bool operator < (const mpreal& a, const mpreal& b            ){ return (mpfr_less_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator < (const mpreal& a, const unsigned long int b  ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const unsigned int b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long int b           ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const int b                ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const long double b        ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) < 0 ); }
-inline bool operator < (const mpreal& a, const double b             ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) < 0 ); }
-
-inline bool operator <= (const mpreal& a, const mpreal& b           ){ return (mpfr_lessequal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long int b          ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
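
Because every ordered comparison involving a NaN yields false, these operators do not form a strict weak ordering over data that may contain NaN, so handing such values straight to std::sort is undefined behavior. A hedged sketch of one common workaround (hypothetical helper name, not part of the header):

    #include <mpreal.h>
    #include <algorithm>
    #include <vector>
    using mpfr::mpreal;

    // Partition NaNs to the end first, then sort the clean prefix.
    void safe_sort(std::vector<mpreal>& v)
    {
        auto mid = std::partition(v.begin(), v.end(),
                                  [](const mpreal& x) { return !(mpfr::isnan)(x); });
        std::sort(v.begin(), mid);
    }
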
-inline bool operator <= (const mpreal& a, const int b               ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const long double b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) <= 0 ); }
-inline bool operator <= (const mpreal& a, const double b            ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) <= 0 ); }
-
-inline bool operator == (const mpreal& a, const mpreal& b           ){ return (mpfr_equal_p(a.mpfr_srcptr(),b.mpfr_srcptr()) != 0 ); }
-inline bool operator == (const mpreal& a, const unsigned long int b ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const unsigned int b      ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_ui(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long int b          ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const int b               ){ return !isnan EIGEN_NOT_A_MACRO (a) && (mpfr_cmp_si(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const long double b       ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_ld(a.mpfr_srcptr(),b) == 0 ); }
-inline bool operator == (const mpreal& a, const double b            ){ return !isnan EIGEN_NOT_A_MACRO (a) && (b == b) && (mpfr_cmp_d (a.mpfr_srcptr(),b) == 0 ); }
-
-inline bool operator != (const mpreal& a, const mpreal& b           ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned long int b ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const unsigned int b      ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long int b          ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const int b               ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const long double b       ){ return !(a == b); }
-inline bool operator != (const mpreal& a, const double b            ){ return !(a == b); }
-
-inline bool isnan EIGEN_NOT_A_MACRO (const mpreal& op){ return (mpfr_nan_p    (op.mpfr_srcptr()) != 0 ); }
-inline bool (isinf)    (const mpreal& op){ return (mpfr_inf_p    (op.mpfr_srcptr()) != 0 ); }
-inline bool (isfinite) (const mpreal& op){ return (mpfr_number_p (op.mpfr_srcptr()) != 0 ); }
-inline bool iszero     (const mpreal& op){ return (mpfr_zero_p   (op.mpfr_srcptr()) != 0 ); }
-inline bool isint      (const mpreal& op){ return (mpfr_integer_p(op.mpfr_srcptr()) != 0 ); }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline bool isregular(const mpreal& op){ return (mpfr_regular_p(op.mpfr_srcptr()));}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Type Converters
-inline bool               mpreal::toBool    ( )               const { return  mpfr_zero_p (mpfr_srcptr()) == 0; }
-inline long               mpreal::toLong    (mp_rnd_t mode)   const { return  mpfr_get_si (mpfr_srcptr(), mode); }
-inline unsigned long      mpreal::toULong   (mp_rnd_t mode)   const { return  mpfr_get_ui (mpfr_srcptr(), mode); }
-inline float              mpreal::toFloat   (mp_rnd_t mode)   const { return  mpfr_get_flt(mpfr_srcptr(), mode); }
-inline double             mpreal::toDouble  (mp_rnd_t mode)   const { return  mpfr_get_d  (mpfr_srcptr(), mode); }
-inline long double        mpreal::toLDouble (mp_rnd_t mode)   const { return  mpfr_get_ld (mpfr_srcptr(), mode); }
-inline long long          mpreal::toLLong   (mp_rnd_t mode)   const { return  mpfr_get_sj (mpfr_srcptr(), mode); }
-inline unsigned long long mpreal::toULLong  (mp_rnd_t mode)   const { return  mpfr_get_uj (mpfr_srcptr(), mode); }
-
-inline ::mpfr_ptr     mpreal::mpfr_ptr()             { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_ptr()    const    { return mp; }
-inline ::mpfr_srcptr  mpreal::mpfr_srcptr() const    { return mp; }
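
All of the converters above take an explicit rounding mode, so narrowing is under caller control instead of being fixed at round-to-nearest. A small sketch, assuming mpfr::mpreal and the GMP_RND* constants used throughout this header:

    #include <mpreal.h>
    using mpfr::mpreal;

    int main()
    {
        mpreal x("2.75");
        long   down = x.toLong(GMP_RNDD);  // 2: round toward -infinity
        long   up   = x.toLong(GMP_RNDU);  // 3: round toward +infinity
        double d    = x.toDouble();        // default rounding mode
        return (down == 2 && up == 3 && d == 2.75) ? 0 : 1;
    }
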
-
-template <class T>
-inline std::string toString(T t, std::ios_base & (*f)(std::ios_base&))
-{
-    std::ostringstream oss;
-    oss << f << t;
-    return oss.str();
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline std::string mpreal::toString(const std::string& format) const
-{
-    char *s = NULL;
-    std::string out;
-
-    if( !format.empty() )
-    {
-        if(!(mpfr_asprintf(&s, format.c_str(), mpfr_srcptr()) < 0))
-        {
-            out = std::string(s);
-
-            mpfr_free_str(s);
-        }
-    }
-
-    return out;
-}
-
-#endif
-
-inline std::string mpreal::toString(int n, int b, mp_rnd_t mode) const
-{
-    // TODO: Add extended format specification (f, e, rounding mode) as it done in output operator
-    (void)b;
-    (void)mode;
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-    std::ostringstream format;
-
-    int digits = (n >= 0) ? n : 2 + bits2digits(mpfr_get_prec(mpfr_srcptr()));
-
-    format << "%." << digits << "RNg";
-
-    return toString(format.str());
-
-#else
-
-    char *s, *ns = NULL;
-    size_t slen, nslen;
-    mp_exp_t exp;
-    std::string out;
-
-    if(mpfr_inf_p(mp))
-    {
-        if(mpfr_sgn(mp)>0) return "+Inf";
-        else               return "-Inf";
-    }
-
-    if(mpfr_zero_p(mp)) return "0";
-    if(mpfr_nan_p(mp))  return "NaN";
-
-    s  = mpfr_get_str(NULL, &exp, b, 0, mp, mode);
-    ns = mpfr_get_str(NULL, &exp, b, (std::max)(0,n), mp, mode);
-
-    if(s!=NULL && ns!=NULL)
-    {
-        slen  = strlen(s);
-        nslen = strlen(ns);
-        if(nslen<=slen)
-        {
-            mpfr_free_str(s);
-            s = ns;
-            slen = nslen;
-        }
-        else {
-            mpfr_free_str(ns);
-        }
-
-        // Make human eye-friendly formatting if possible
-        if (exp>0 && static_cast<size_t>(exp)<slen)
-        {
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp) ptr--;
-
-                if(ptr==s+exp) out = std::string(s,exp+1);
-                else           out = std::string(s,exp+1)+'.'+std::string(s+exp+1,ptr-(s+exp+1)+1);
-
-                //out = string(s,exp+1)+'.'+string(s+exp+1);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+exp-1) ptr--;
-
-                if(ptr==s+exp-1) out = std::string(s,exp);
-                else             out = std::string(s,exp)+'.'+std::string(s+exp,ptr-(s+exp)+1);
-
-                //out = string(s,exp)+'.'+string(s+exp);
-            }
-
-        }else{ // exp<0 || exp>slen
-            if(s[0]=='-')
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s+1) ptr--;
-
-                if(ptr==s+1) out = std::string(s,2);
-                else         out = std::string(s,2)+'.'+std::string(s+2,ptr-(s+2)+1);
-
-                //out = string(s,2)+'.'+string(s+2);
-            }
-            else
-            {
-                // Remove zeros starting from right end
-                char* ptr = s+slen-1;
-                while (*ptr=='0' && ptr>s) ptr--;
-
-                if(ptr==s) out = std::string(s,1);
-                else       out = std::string(s,1)+'.'+std::string(s+1,ptr-(s+1)+1);
-
-                //out = string(s,1)+'.'+string(s+1);
-            }
-
-            // Make final string
-            if(--exp)
-            {
-                if(exp>0) out += "e+"+mpfr::toString(exp,std::dec);
-                else      out += "e"+mpfr::toString(exp,std::dec);
-            }
-        }
-
-        mpfr_free_str(s);
-        return out;
-    }else{
-        return "conversion error!";
-    }
-#endif
-}
-
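
The format string accepted by toString() is handed straight to mpfr_asprintf, so it uses MPFR's printf extensions: a precision, an embedded rounding mode letter (N, Z, U, D), and a conversion specifier, exactly like the "%.<digits>RNg" string the code above builds. A brief sketch:

    #include <mpreal.h>
    #include <iostream>
    using mpfr::mpreal;

    int main()
    {
        mpreal pi = mpfr::const_pi(256);             // pi at 256 bits
        std::cout << pi.toString("%.50RNg") << '\n'; // 50 digits, round-to-nearest
    }
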
-//////////////////////////////////////////////////////////////////////////
-// I/O
-inline std::ostream& mpreal::output(std::ostream& os) const
-{
-    std::ostringstream format;
-    const std::ios::fmtflags flags = os.flags();
-
-    format << ((flags & std::ios::showpos) ? "%+" : "%");
-    if (os.precision() >= 0)
-        format << '.' << os.precision() << "R*"
-               << ((flags & std::ios::floatfield) == std::ios::fixed ? 'f' :
-                   (flags & std::ios::floatfield) == std::ios::scientific ? 'e' :
-                   'g');
-    else
-        format << "R*e";
-
-    char *s = NULL;
-    if(!(mpfr_asprintf(&s, format.str().c_str(),
-                        mpfr::mpreal::get_default_rnd(),
-                        mpfr_srcptr())
-        < 0))
-    {
-        os << std::string(s);
-        mpfr_free_str(s);
-    }
-    return os;
-}
-
-inline std::ostream& operator<<(std::ostream& os, const mpreal& v)
-{
-    return v.output(os);
-}
-
-inline std::istream& operator>>(std::istream &is, mpreal& v)
-{
-    // TODO: use cout::hexfloat and other flags to setup base
-    std::string tmp;
-    is >> tmp;
-    mpfr_set_str(v.mpfr_ptr(), tmp.c_str(), 10, mpreal::get_default_rnd());
-    return is;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Bits - decimal digits relation
-//    bits   = ceil(digits*log[2](10))
-//    digits = floor(bits*log[10](2))
-
-inline mp_prec_t digits2bits(int d)
-{
-    const double LOG2_10 = 3.3219280948873624;
-
-    return mp_prec_t(std::ceil( d * LOG2_10 ));
-}
-
-inline int bits2digits(mp_prec_t b)
-{
-    const double LOG10_2 = 0.30102999566398119;
-
-    return int(std::floor( b * LOG10_2 ));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get number properties
-inline mpreal& mpreal::setSign(int sign, mp_rnd_t RoundingMode)
-{
-    mpfr_setsign(mpfr_ptr(), mpfr_srcptr(), sign < 0, RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline int mpreal::getPrecision() const
-{
-    return int(mpfr_get_prec(mpfr_srcptr()));
-}
-
-inline mpreal& mpreal::setPrecision(int Precision, mp_rnd_t RoundingMode)
-{
-    mpfr_prec_round(mpfr_ptr(), Precision, RoundingMode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setInf(int sign)
-{
-    mpfr_set_inf(mpfr_ptr(), sign);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setNan()
-{
-    mpfr_set_nan(mpfr_ptr());
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mpreal& mpreal::setZero(int sign)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    mpfr_set_zero(mpfr_ptr(), sign);
-#else
-    mpfr_set_si(mpfr_ptr(), 0, (mpfr_get_default_rounding_mode)());
-    setSign(sign);
-#endif
-
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return *this;
-}
-
-inline mp_prec_t mpreal::get_prec() const
-{
-    return mpfr_get_prec(mpfr_srcptr());
-}
-
-inline void mpreal::set_prec(mp_prec_t prec, mp_rnd_t rnd_mode)
-{
-    mpfr_prec_round(mpfr_ptr(),prec,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-}
-
-inline mp_exp_t mpreal::get_exp () const
-{
-    return mpfr_get_exp(mpfr_srcptr());
-}
-
-inline int mpreal::set_exp (mp_exp_t e)
-{
-    int x = mpfr_set_exp(mpfr_ptr(), e);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return x;
-}
-
-inline const mpreal frexp(const mpreal& x, mp_exp_t* exp, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpreal y(x);
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0))
-    mpfr_frexp(exp,y.mpfr_ptr(),x.mpfr_srcptr(),mode);
-#else
-    *exp = mpfr_get_exp(y.mpfr_srcptr());
-    mpfr_set_exp(y.mpfr_ptr(),0);
-#endif
-    return y;
-}
-
-inline const mpreal ldexp(const mpreal& v, mp_exp_t exp)
-{
-    mpreal x(v);
-
-    // rounding is not important since we are just increasing the exponent (= exact operation)
-    mpfr_mul_2si(x.mpfr_ptr(), x.mpfr_srcptr(), exp, mpreal::get_default_rnd());
-    return x;
-}
-
-inline const mpreal scalbn(const mpreal& v, mp_exp_t exp)
-{
-    return ldexp(v, exp);
-}
-
-inline mpreal machine_epsilon(mp_prec_t prec)
-{
-    /* the smallest eps such that 1 + eps != 1 */
-    return machine_epsilon(mpreal(1, prec));
-}
-
-inline mpreal machine_epsilon(const mpreal& x)
-{
-    /* the smallest eps such that x + eps != x */
-    if( x < 0)
-    {
-        return nextabove(-x) + x;
-    }else{
-        return nextabove( x) - x;
-    }
-}
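
For a 53-bit mpreal, machine_epsilon matches the familiar double epsilon 2^-52, since eps(1) = nextabove(1) - 1. A quick check, assuming mpfr::mpreal as above:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        mpreal eps = mpfr::machine_epsilon(53);         // at 53-bit precision
        assert(eps == mpfr::ldexp(mpreal(1, 53), -52)); // equals 2^-52
    }
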
-
-// minval is 'safe' meaning 1 / minval does not overflow
-inline mpreal minval(mp_prec_t prec)
-{
-    /* min = 1/2 * 2^emin = 2^(emin - 1) */
-    return mpreal(1, prec) << mpreal::get_emin()-1;
-}
-
-// maxval is 'safe' meaning 1 / maxval does not underflow
-inline mpreal maxval(mp_prec_t prec)
-{
-    /* max = (1 - eps) * 2^emax, eps is machine epsilon */
-    return (mpreal(1, prec) - machine_epsilon(prec)) << mpreal::get_emax();
-}
-
-inline bool isEqualUlps(const mpreal& a, const mpreal& b, int maxUlps)
-{
-    return abs(a - b) <= machine_epsilon((max)(abs(a), abs(b))) * maxUlps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b, const mpreal& eps)
-{
-    return abs(a - b) <= eps;
-}
-
-inline bool isEqualFuzzy(const mpreal& a, const mpreal& b)
-{
-    return isEqualFuzzy(a, b, machine_epsilon((max)(1, (min)(abs(a), abs(b)))));
-}
-
-//////////////////////////////////////////////////////////////////////////
-// C++11 sign functions.
-inline mpreal copysign(const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal rop(0, mpfr_get_prec(x.mpfr_ptr()));
-    mpfr_setsign(rop.mpfr_ptr(), x.mpfr_srcptr(), mpfr_signbit(y.mpfr_srcptr()), rnd_mode);
-    return rop;
-}
-
-inline bool signbit(const mpreal& x)
-{
-    return mpfr_signbit(x.mpfr_srcptr());
-}
-
-inline mpreal& setsignbit(mpreal& x, bool minus, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpfr_setsign(x.mpfr_ptr(), x.mpfr_srcptr(), minus, rnd_mode);
-    return x;
-}
-
-inline const mpreal modf(const mpreal& v, mpreal& n)
-{
-    mpreal f(v);
-
-    // rounding is not important since we are using the same number
-    mpfr_frac (f.mpfr_ptr(),f.mpfr_srcptr(),mpreal::get_default_rnd());
-    mpfr_trunc(n.mpfr_ptr(),v.mpfr_srcptr());
-    return f;
-}
-
-inline int mpreal::check_range (int t, mp_rnd_t rnd_mode)
-{
-    return mpfr_check_range(mpfr_ptr(),t,rnd_mode);
-}
-
-inline int mpreal::subnormalize (int t,mp_rnd_t rnd_mode)
-{
-    int r = mpfr_subnormalize(mpfr_ptr(),t,rnd_mode);
-    MPREAL_MSVC_DEBUGVIEW_CODE;
-    return r;
-}
-
-inline mp_exp_t mpreal::get_emin (void)
-{
-    return mpfr_get_emin();
-}
-
-inline int mpreal::set_emin (mp_exp_t exp)
-{
-    return mpfr_set_emin(exp);
-}
-
-inline mp_exp_t mpreal::get_emax (void)
-{
-    return mpfr_get_emax();
-}
-
-inline int mpreal::set_emax (mp_exp_t exp)
-{
-    return mpfr_set_emax(exp);
-}
-
-inline mp_exp_t mpreal::get_emin_min (void)
-{
-    return mpfr_get_emin_min();
-}
-
-inline mp_exp_t mpreal::get_emin_max (void)
-{
-    return mpfr_get_emin_max();
-}
-
-inline mp_exp_t mpreal::get_emax_min (void)
-{
-    return mpfr_get_emax_min();
-}
-
-inline mp_exp_t mpreal::get_emax_max (void)
-{
-    return mpfr_get_emax_max();
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Mathematical Functions
-//////////////////////////////////////////////////////////////////////////
-#define MPREAL_UNARY_MATH_FUNCTION_BODY(f)                    \
-        mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));          \
-        mpfr_##f(y.mpfr_ptr(), x.mpfr_srcptr(), r);           \
-        return y;
-
-inline const mpreal sqr  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqr );    }
-
-inline const mpreal sqrt (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{   MPREAL_UNARY_MATH_FUNCTION_BODY(sqrt);    }
-
-inline const mpreal sqrt(const unsigned long int x, mp_rnd_t r)
-{
-    mpreal y;
-    mpfr_sqrt_ui(y.mpfr_ptr(), x, r);
-    return y;
-}
-
-inline const mpreal sqrt(const unsigned int v, mp_rnd_t rnd_mode)
-{
-    return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-}
-
-inline const mpreal sqrt(const long int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal sqrt(const int v, mp_rnd_t rnd_mode)
-{
-    if (v>=0)   return sqrt(static_cast<unsigned long int>(v),rnd_mode);
-    else        return mpreal().setNan(); // NaN
-}
-
-inline const mpreal root(const mpreal& x, unsigned long int k, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(x.mpfr_srcptr()));
-    #if (MPFR_VERSION >= MPFR_VERSION_NUM(4,0,0))
-    mpfr_rootn_ui(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
-    #else
-    mpfr_root(y.mpfr_ptr(), x.mpfr_srcptr(), k, r);
-    #endif
-    return y;
-}
-
-inline const mpreal dim(const mpreal& a, const mpreal& b, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, mpfr_get_prec(a.mpfr_srcptr()));
-    mpfr_dim(y.mpfr_ptr(), a.mpfr_srcptr(), b.mpfr_srcptr(), r);
-    return y;
-}
-
-inline int cmpabs(const mpreal& a,const mpreal& b)
-{
-    return mpfr_cmpabs(a.mpfr_ptr(), b.mpfr_srcptr());
-}
-
-inline int sin_cos(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sin_cos(s.mpfr_ptr(), c.mpfr_ptr(), v.mpfr_srcptr(), rnd_mode);
-}
-
-inline const mpreal sqrt  (const long double v, mp_rnd_t rnd_mode)    {    return sqrt(mpreal(v),rnd_mode);    }
-inline const mpreal sqrt  (const double v, mp_rnd_t rnd_mode)         {    return sqrt(mpreal(v),rnd_mode);    }
-
-inline const mpreal cbrt  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cbrt );    }
-inline const mpreal fabs  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal abs   (const mpreal& x, mp_rnd_t r)                             {   MPREAL_UNARY_MATH_FUNCTION_BODY(abs  );    }
-inline const mpreal log   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log  );    }
-inline const mpreal log2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log2 );    }
-inline const mpreal log10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log10);    }
-inline const mpreal exp   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp  );    }
-inline const mpreal exp2  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp2 );    }
-inline const mpreal exp10 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(exp10);    }
-inline const mpreal cos   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cos  );    }
-inline const mpreal sin   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sin  );    }
-inline const mpreal tan   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tan  );    }
-inline const mpreal sec   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sec  );    }
-inline const mpreal csc   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csc  );    }
-inline const mpreal cot   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cot  );    }
-inline const mpreal acos  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acos );    }
-inline const mpreal asin  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asin );    }
-inline const mpreal atan  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atan );    }
-
-inline const mpreal logb  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   return log2 (abs(x),r);    }
-
-inline const mpreal acot  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atan (1/v, r);    }
-inline const mpreal asec  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acos (1/v, r);    }
-inline const mpreal acsc  (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asin (1/v, r);    }
-inline const mpreal acoth (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return atanh(1/v, r);    }
-inline const mpreal asech (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return acosh(1/v, r);    }
-inline const mpreal acsch (const mpreal& v, mp_rnd_t r = mpreal::get_default_rnd()) {   return asinh(1/v, r);    }
-
-inline const mpreal cosh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(cosh );    }
-inline const mpreal sinh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sinh );    }
-inline const mpreal tanh  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(tanh );    }
-inline const mpreal sech  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(sech );    }
-inline const mpreal csch  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(csch );    }
-inline const mpreal coth  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(coth );    }
-inline const mpreal acosh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(acosh);    }
-inline const mpreal asinh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(asinh);    }
-inline const mpreal atanh (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(atanh);    }
-
-inline const mpreal log1p   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(log1p  );    }
-inline const mpreal expm1   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(expm1  );    }
-inline const mpreal eint    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(eint   );    }
-inline const mpreal gamma   (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal tgamma  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(gamma  );    }
-inline const mpreal lngamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(lngamma);    }
-inline const mpreal zeta    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(zeta   );    }
-inline const mpreal erf     (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erf    );    }
-inline const mpreal erfc    (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(erfc   );    }
-inline const mpreal besselj0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j0     );    }
-inline const mpreal besselj1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(j1     );    }
-inline const mpreal bessely0(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y0     );    }
-inline const mpreal bessely1(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(y1     );    }
-
-inline const mpreal nextpow2(const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, x.getPrecision());
-
-    if(!iszero(x))
-        y = ceil(log2(abs(x,r),r));
-
-    return y;
-}
-
-inline const mpreal atan2 (const mpreal& y, const mpreal& x, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_atan2(a.mpfr_ptr(), y.mpfr_srcptr(), x.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal hypot (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_hypot(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal hypot(const mpreal& a, const mpreal& b, const mpreal& c)
-{
-    if(isnan EIGEN_NOT_A_MACRO (a) || isnan EIGEN_NOT_A_MACRO (b) || isnan EIGEN_NOT_A_MACRO(c)) return mpreal().setNan();
-    else
-    {
-        mpreal absa = abs(a), absb = abs(b), absc = abs(c);
-        mpreal w = (std::max)(absa, (std::max)(absb, absc));
-        mpreal r;
-
-        if (!iszero(w))
-        {
-            mpreal iw = 1/w;
-            r = w * sqrt(sqr(absa*iw) + sqr(absb*iw) + sqr(absc*iw));
-        }
-
-        return r;
-    }
-}
-
-inline const mpreal hypot(const mpreal& a, const mpreal& b, const mpreal& c, const mpreal& d)
-{
-    if(isnan EIGEN_NOT_A_MACRO (a) || isnan EIGEN_NOT_A_MACRO (b) || isnan EIGEN_NOT_A_MACRO (c) || isnan EIGEN_NOT_A_MACRO (d)) return mpreal().setNan();
-    else
-    {
-        mpreal absa = abs(a), absb = abs(b), absc = abs(c), absd = abs(d);
-        mpreal w = (std::max)(absa, (std::max)(absb, (std::max)(absc, absd)));
-        mpreal r;
-
-        if (!iszero(w))
-        {
-            mpreal iw = 1/w;
-            r = w * sqrt(sqr(absa*iw) + sqr(absb*iw) + sqr(absc*iw) + sqr(absd*iw));
-        }
-
-        return r;
-    }
-}
-
-inline const mpreal remainder (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remainder(a.mpfr_ptr(), x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal remquo (long* q, const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a(0,(std::max)(y.getPrecision(), x.getPrecision()));
-    mpfr_remquo(a.mpfr_ptr(),q, x.mpfr_srcptr(), y.mpfr_srcptr(), rnd_mode);
-    return a;
-}
-
-inline const mpreal fac_ui (unsigned long int v, mp_prec_t prec = mpreal::get_default_prec(),
-                            mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(0, prec);
-    mpfr_fac_ui(x.mpfr_ptr(),v,rnd_mode);
-    return x;
-}
-
-
-inline const mpreal lgamma (const mpreal& v, int *signp = 0, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    int tsignp;
-
-    if(signp)   mpfr_lgamma(x.mpfr_ptr(), signp,v.mpfr_srcptr(),rnd_mode);
-    else        mpfr_lgamma(x.mpfr_ptr(),&tsignp,v.mpfr_srcptr(),rnd_mode);
-
-    return x;
-}
-
-
-inline const mpreal besseljn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, x.getPrecision());
-    mpfr_jn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal besselyn (long n, const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal y(0, x.getPrecision());
-    mpfr_yn(y.mpfr_ptr(), n, x.mpfr_srcptr(), r);
-    return y;
-}
-
-inline const mpreal fma (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fma(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal fms (const mpreal& v1, const mpreal& v2, const mpreal& v3, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2, p3;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-    p3 = v3.get_prec();
-
-    a.set_prec(p3>p2?(p3>p1?p3:p1):(p2>p1?p2:p1));
-
-    mpfr_fms(a.mp,v1.mp,v2.mp,v3.mp,rnd_mode);
-    return a;
-}
-
-inline const mpreal agm (const mpreal& v1, const mpreal& v2, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t p1, p2;
-
-    p1 = v1.get_prec();
-    p2 = v2.get_prec();
-
-    a.set_prec(p1>p2?p1:p2);
-
-    mpfr_agm(a.mp, v1.mp, v2.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal sum (const mpreal tab[], const unsigned long int n, int& status, mp_rnd_t mode = mpreal::get_default_rnd())
-{
-    mpfr_srcptr *p = new mpfr_srcptr[n];
-
-    for (unsigned long int i = 0; i < n; i++)
-        p[i] = tab[i].mpfr_srcptr();
-
-    mpreal x;
-    status = mpfr_sum(x.mpfr_ptr(), (mpfr_ptr*)p, n, mode);
-
-    delete [] p;
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 2.4.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(2,4,0))
-
-inline int sinh_cosh(mpreal& s, mpreal& c, const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    return mpfr_sinh_cosh(s.mp,c.mp,v.mp,rnd_mode);
-}
-
-inline const mpreal li2 (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd())
-{
-    MPREAL_UNARY_MATH_FUNCTION_BODY(li2);
-}
-
-inline const mpreal rem (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    /*  R = rem(X,Y) if Y != 0, returns X - n * Y where n = trunc(X/Y). */
-    return fmod(x, y, rnd_mode);
-}
-
-inline const mpreal mod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    (void)rnd_mode;
-
-    /*
-
-    m = mod(x,y) if y != 0, returns x - n*y where n = floor(x/y)
-
-    The following are true by convention:
-
-    - mod(x,0) is x
-    - mod(x,x) is 0
-    - mod(x,y) for x != y and y != 0 has the same sign as y.
-
-    */
-
-    if(iszero(y)) return x;
-    if(x == y) return 0;
-
-    mpreal m = x - floor(x / y) * y;
-
-    return copysign(abs(m),y); // make sure result has the same sign as Y
-}
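
The practical difference between mod and fmod here is the sign convention: mod uses floored division and follows the sign of y, while fmod (below) uses truncated division and follows the sign of x. A small worked example with hypothetical inputs:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        mpreal x(-7), y(3);
        assert(mpfr::mod (x, y) ==  2);  // -7 - floor(-7/3)*3 = -7 + 9
        assert(mpfr::fmod(x, y) == -1);  // -7 - trunc(-7/3)*3 = -7 + 6
    }
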
-
-inline const mpreal fmod (const mpreal& x, const mpreal& y, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal a;
-    mp_prec_t yp, xp;
-
-    yp = y.get_prec();
-    xp = x.get_prec();
-
-    a.set_prec(yp>xp?yp:xp);
-
-    mpfr_fmod(a.mp, x.mp, y.mp, rnd_mode);
-
-    return a;
-}
-
-inline const mpreal rec_sqrt(const mpreal& v, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(v);
-    mpfr_rec_sqrt(x.mp,v.mp,rnd_mode);
-    return x;
-}
-#endif //  MPFR 2.4.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// MPFR 3.0.0 Specifics
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal digamma (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(digamma);    }
-inline const mpreal ai      (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(ai);         }
-#endif // MPFR 3.0.0 Specifics
-
-//////////////////////////////////////////////////////////////////////////
-// Constants
-inline const mpreal const_log2 (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_log2(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_pi (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_pi(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_euler (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_euler(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_catalan (mp_prec_t p = mpreal::get_default_prec(), mp_rnd_t r = mpreal::get_default_rnd())
-{
-    mpreal x(0, p);
-    mpfr_const_catalan(x.mpfr_ptr(), r);
-    return x;
-}
-
-inline const mpreal const_infinity (int sign = 1, mp_prec_t p = mpreal::get_default_prec())
-{
-    mpreal x(0, p);
-    mpfr_set_inf(x.mpfr_ptr(), sign);
-    return x;
-}
-
-//////////////////////////////////////////////////////////////////////////
-// Integer Related Functions
-inline const mpreal ceil(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_ceil(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal floor(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_floor(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal round(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_round(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal trunc(const mpreal& v)
-{
-    mpreal x(v);
-    mpfr_trunc(x.mp,v.mp);
-    return x;
-}
-
-inline const mpreal rint       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint      );    }
-inline const mpreal rint_ceil  (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_ceil );    }
-inline const mpreal rint_floor (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_floor);    }
-inline const mpreal rint_round (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_round);    }
-inline const mpreal rint_trunc (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(rint_trunc);    }
-inline const mpreal frac       (const mpreal& x, mp_rnd_t r = mpreal::get_default_rnd()) {   MPREAL_UNARY_MATH_FUNCTION_BODY(frac      );    }
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline int sgn(const mpreal& op)
-{
-    // Please note, this is classic signum function which ignores sign of zero.
-    // Use signbit if you need sign of zero.
-    return mpfr_sgn(op.mpfr_srcptr());
-}
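
As the comment in sgn() notes, the classic signum cannot distinguish the two IEEE zeros: mpfr_sgn returns 0 for both +0 and -0, while signbit still reports the sign bit. A minimal sketch:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        mpreal nz(0);
        nz.setZero(-1);                   // negative zero
        assert(mpfr::sgn(nz) == 0);       // signum ignores the sign of zero
        assert(mpfr::signbit(nz));        // signbit does not
    }
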
-
-//////////////////////////////////////////////////////////////////////////
-// Miscellaneous Functions
-inline void swap (mpreal& a, mpreal& b)                     {    mpfr_swap(a.mpfr_ptr(),b.mpfr_ptr());    }
-inline const mpreal (max)(const mpreal& x, const mpreal& y) {    return (x>y?x:y);    }
-inline const mpreal (min)(const mpreal& x, const mpreal& y) {    return (x<y?x:y);    }
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-inline const mpreal urandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_urandom(x.mpfr_ptr(), state, rnd_mode);
-    return x;
-}
-#endif
-
-#if (MPFR_VERSION <= MPFR_VERSION_NUM(2,4,2))
-inline const mpreal random2 (mp_size_t size, mp_exp_t exp)
-{
-    mpreal x;
-    mpfr_random2(x.mpfr_ptr(),size,exp);
-    return x;
-}
-#endif
-
-// Uniformly distributed random number generation
-// a = random(seed); <- initialization & first random number generation
-// a = random();     <- next random numbers generation
-// seed != 0
-inline const mpreal random(unsigned int seed = 0)
-{
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,0,0))
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0) gmp_randseed_ui(state,seed);
-
-    return mpfr::urandom(state);
-#else
-    if(seed != 0) std::srand(seed);
-    return mpfr::mpreal(std::rand()/(double)RAND_MAX);
-#endif
-
-}
-
-#if (MPFR_VERSION >= MPFR_VERSION_NUM(3,1,0) && MPFR_VERSION < MPFR_VERSION_NUM(4,0,0))
-
-// TODO:
-// Use mpfr_nrandom since mpfr_grandom is deprecated
-#if defined(_MSC_VER)
-#pragma warning( push )
-#pragma warning( disable : 1478)
-#endif
-inline const mpreal grandom (gmp_randstate_t& state, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x;
-    mpfr_grandom(x.mpfr_ptr(), NULL, state, rnd_mode);
-    return x;
-}
-#if defined(_MSC_VER)
-#pragma warning( pop )
-#endif
-
-inline const mpreal grandom(unsigned int seed = 0)
-{
-    static gmp_randstate_t state;
-    static bool initialize = true;
-
-    if(initialize)
-    {
-        gmp_randinit_default(state);
-        gmp_randseed_ui(state,0);
-        initialize = false;
-    }
-
-    if(seed != 0) gmp_randseed_ui(state,seed);
-
-    return mpfr::grandom(state);
-}
-#endif
-
-//////////////////////////////////////////////////////////////////////////
-// Set/Get global properties
-inline void mpreal::set_default_prec(mp_prec_t prec)
-{
-    mpfr_set_default_prec(prec);
-}
-
-inline void mpreal::set_default_rnd(mp_rnd_t rnd_mode)
-{
-    mpfr_set_default_rounding_mode(rnd_mode);
-}
-
-inline bool mpreal::fits_in_bits(double x, int n)
-{
-    int i;
-    double t;
-    return IsInf(x) || (std::modf ( std::ldexp ( std::frexp ( x, &i ), n ), &t ) == 0.0);
-}
-
-inline const mpreal pow(const mpreal& a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow(x.mp,x.mp,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const mpz_t b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_z(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_ui(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode);
-}
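
The long ladder of pow overloads that follows exists to route each integer/floating combination to the cheapest MPFR kernel (mpfr_ui_pow_ui, mpfr_ui_pow, mpfr_pow_ui, mpfr_pow_si), falling back to the general mpfr_pow only when an operand is negative or fractional, as the trailing comments indicate. Usage is uniform regardless of which kernel ends up being selected; a hedged sketch:

    #include <mpreal.h>
    #include <cassert>
    using mpfr::mpreal;

    int main()
    {
        assert(mpfr::pow(mpreal(2), 10) == 1024);  // signed-exponent fast path
        assert(mpfr::pow(2ul, 10ul)     == 1024);  // mpfr_ui_pow_ui fast path
    }
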
-
-inline const mpreal pow(const mpreal& a, const long int b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_pow_si(x.mp,x.mp,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const mpreal& a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<long int>(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const mpreal& a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const unsigned long int a, const mpreal& b, mp_rnd_t rnd_mode = mpreal::get_default_rnd())
-{
-    mpreal x(a);
-    mpfr_ui_pow(x.mp,a,b.mp,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else        return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const int a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),b,rnd_mode);
-    else        return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-inline const mpreal pow(const double a, const mpreal& b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode);
-}
-
-// pow unsigned long int
-inline const mpreal pow(const unsigned long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    mpreal x(a);
-    mpfr_ui_pow_ui(x.mp,a,b,rnd_mode);
-    return x;
-}
-
-inline const mpreal pow(const unsigned long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(a,static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned long int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(a,mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow unsigned int
-inline const mpreal pow(const unsigned int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-}
-
-inline const mpreal pow(const unsigned int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const int b, mp_rnd_t rnd_mode)
-{
-    if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-    else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-inline const mpreal pow(const unsigned int a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-}
-
-// pow long int
-inline const mpreal pow(const long int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const long int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const long int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow int
-inline const mpreal pow(const int a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),b,rnd_mode); //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    if (a>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode);  //mpfr_ui_pow_ui
-    else     return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const int a, const long int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const int b, mp_rnd_t rnd_mode)
-{
-    if (a>0)
-    {
-        if(b>0) return pow(static_cast<unsigned long int>(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_ui_pow_ui
-        else    return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    }else{
-        return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-    }
-}
-
-inline const mpreal pow(const int a, const long double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-inline const mpreal pow(const int a, const double b, mp_rnd_t rnd_mode)
-{
-    if (a>=0)   return pow(static_cast<unsigned long int>(a),mpreal(b),rnd_mode); //mpfr_ui_pow
-    else        return pow(mpreal(a),mpreal(b),rnd_mode); //mpfr_pow
-}
-
-// pow long double
-inline const mpreal pow(const long double a, const long double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const long double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); //mpfr_pow_ui
-}
-
-inline const mpreal pow(const long double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const long double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const double b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),mpreal(b),rnd_mode);
-}
-
-inline const mpreal pow(const double a, const unsigned long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const unsigned int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<unsigned long int>(b),rnd_mode); // mpfr_pow_ui
-}
-
-inline const mpreal pow(const double a, const long int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),b,rnd_mode); // mpfr_pow_si
-}
-
-inline const mpreal pow(const double a, const int b, mp_rnd_t rnd_mode)
-{
-    return pow(mpreal(a),static_cast<long int>(b),rnd_mode); // mpfr_pow_si
-}
-} // End of mpfr namespace
-
-// Explicit specialization of std::swap for mpreal numbers
-// Thus standard algorithms will use efficient version of swap (due to Koenig lookup)
-// Non-throwing swap C++ idiom: http://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Non-throwing_swap
-namespace std
-{
-    // we are allowed to extend namespace std with specializations only
-    template <>
-    inline void swap(mpfr::mpreal& x, mpfr::mpreal& y)
-    {
-        return mpfr::swap(x, y);
-    }
-
-    template<>
-    class numeric_limits<mpfr::mpreal>
-    {
-    public:
-        static const bool is_specialized    = true;
-        static const bool is_signed         = true;
-        static const bool is_integer        = false;
-        static const bool is_exact          = false;
-        static const int  radix             = 2;
-
-        static const bool has_infinity      = true;
-        static const bool has_quiet_NaN     = true;
-        static const bool has_signaling_NaN = true;
-
-        static const bool is_iec559         = true;        // = IEEE 754
-        static const bool is_bounded        = true;
-        static const bool is_modulo         = false;
-        static const bool traps             = true;
-        static const bool tinyness_before   = true;
-
-        static const float_denorm_style has_denorm  = denorm_absent;
-
-        inline static mpfr::mpreal (min)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::minval(precision);  }
-        inline static mpfr::mpreal (max)    (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::maxval(precision);  }
-        inline static mpfr::mpreal lowest   (mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return -mpfr::maxval(precision);  }
-
-        // Returns smallest eps such that 1 + eps != 1 (classic machine epsilon)
-        inline static mpfr::mpreal epsilon(mp_prec_t precision = mpfr::mpreal::get_default_prec()) {  return  mpfr::machine_epsilon(precision);  }
-
-        // Returns smallest eps such that x + eps != x (relative machine epsilon)
-        inline static mpfr::mpreal epsilon(const mpfr::mpreal& x) {  return mpfr::machine_epsilon(x);  }
-
-        inline static mpfr::mpreal round_error(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            if(r == GMP_RNDN)  return mpfr::mpreal(0.5, precision);
-            else               return mpfr::mpreal(1.0, precision);
-        }
-
-        inline static const mpfr::mpreal infinity()         { return mpfr::const_infinity();     }
-        inline static const mpfr::mpreal quiet_NaN()        { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal signaling_NaN()    { return mpfr::mpreal().setNan();    }
-        inline static const mpfr::mpreal denorm_min()       { return (min)();                    }
-
-        // Please note, exponent range is not fixed in MPFR
-        static const int min_exponent = MPFR_EMIN_DEFAULT;
-        static const int max_exponent = MPFR_EMAX_DEFAULT;
-        MPREAL_PERMISSIVE_EXPR static const int min_exponent10 = (int) (MPFR_EMIN_DEFAULT * 0.3010299956639811);
-        MPREAL_PERMISSIVE_EXPR static const int max_exponent10 = (int) (MPFR_EMAX_DEFAULT * 0.3010299956639811);
-
-#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
-
-        // Following members should be constant according to standard, but they can be variable in MPFR
-        // So we define them as functions here.
-        //
-        // This is preferable way for std::numeric_limits specialization.
-        // But it is incompatible with standard std::numeric_limits and might not work with other libraries, e.g. boost.
-        // See below for compatible implementation.
-        inline static float_round_style round_style()
-        {
-            mp_rnd_t r = mpfr::mpreal::get_default_rnd();
-
-            switch (r)
-            {
-            case GMP_RNDN: return round_to_nearest;
-            case GMP_RNDZ: return round_toward_zero;
-            case GMP_RNDU: return round_toward_infinity;
-            case GMP_RNDD: return round_toward_neg_infinity;
-            default: return round_indeterminate;
-            }
-        }
-
-        inline static int digits()                        {  return int(mpfr::mpreal::get_default_prec());  }
-        inline static int digits(const mpfr::mpreal& x)   {  return x.getPrecision();  }
-
-        inline static int digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return mpfr::bits2digits(precision);
-        }
-
-        inline static int digits10(const mpfr::mpreal& x)
-        {
-            return mpfr::bits2digits(x.getPrecision());
-        }
-
-        inline static int max_digits10(mp_prec_t precision = mpfr::mpreal::get_default_prec())
-        {
-            return digits10(precision);
-        }
-#else
-        // Digits and round_style are NOT constants when it comes to mpreal.
-        // If possible, please use functions digits() and round_style() defined above.
-        //
-        // These (default) values are preserved for compatibility with existing libraries, e.g. boost.
-        // Change them accordingly to your application.
-        //
-        // For example, if you use 256 bits of precision uniformly in your program, then:
-        // digits       = 256
-        // digits10     = 77
-        // max_digits10 = 78
-        //
-        // Approximate formula for decimal digits is: digits10 = floor(log10(2) * digits). See bits2digits() for more details.
-
-        static const std::float_round_style round_style = round_to_nearest;
-        static const int digits       = 53;
-        static const int digits10     = 15;
-        static const int max_digits10 = 16;
-#endif
-    };
-
-}
-
-#endif /* __MPREAL_H__ */
diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp
index 4a25e993c0860fce08750a7489cc3f1de1a5e0a8..10beb0714a4d826ffc66e74f2d1701053bd7b21c 100644
--- a/unsupported/test/mpreal_support.cpp
+++ b/unsupported/test/mpreal_support.cpp
@@ -1,3 +1,4 @@
+#include <mpreal.h>  // Must be included before main.h.
#include "main.h" #include #include diff --git a/unsupported/test/sparse_extra.cpp b/unsupported/test/sparse_extra.cpp index cdfd10ca45901f0d3ef32b7c3b9c06c322068f2c..602c2cb84de7e8d1df92586ee1c459ce727a30ee 100644 --- a/unsupported/test/sparse_extra.cpp +++ b/unsupported/test/sparse_extra.cpp @@ -31,6 +31,22 @@ static long g_dense_op_sparse_count = 0; #include "sparse_basic.cpp" #endif +#if EIGEN_HAS_CXX11 + +#ifdef min +#undef min +#endif + +#ifdef max +#undef max +#endif + +#include +#define EIGEN_UNORDERED_MAP_SUPPORT + +#endif + + #include template @@ -146,6 +162,7 @@ template void sparse_extra(const SparseMatrixType& re } + template void check_marketio() { diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp index 589bb76e10f835f6333ded87b5ed6de0d7b405a0..44c77535e61471ffc72ce0fec4c58514746a6b41 100644 --- a/unsupported/test/special_functions.cpp +++ b/unsupported/test/special_functions.cpp @@ -171,9 +171,9 @@ template void array_special_functions() // Check the ndtri function against scipy.special.ndtri { - ArrayType x(7), res(7), ref(7); - x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01; - ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408; + ArrayType x(11), res(11), ref(11); + x << 0.5, 0.2, 0.8, 0.9, 0.1, 0.99, 0.01, 0, 1, -0.01, 1.01; + ref << 0., -0.8416212335729142, 0.8416212335729142, 1.2815515655446004, -1.2815515655446004, 2.3263478740408408, -2.3263478740408408, -plusinf, plusinf, nan, nan; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.ndtri(); verify_component_wise(res, ref); ); CALL_SUBTEST( res = ndtri(x); verify_component_wise(res, ref); ); @@ -191,10 +191,10 @@ template void array_special_functions() // Check the zeta function against scipy.special.zeta { - ArrayType x(10), q(10), res(10), ref(10); - x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4; - q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3; - ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf; + ArrayType x(11), q(11), res(11), ref(11); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9, 2, 3, 4, 2000; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345, -1, -2, -3, 2000; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan, plusinf, nan, plusinf, 0; CALL_SUBTEST( verify_component_wise(ref, ref); ); CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); );