From 2a671c388e054f38e95a442a3007eee5542bea62 Mon Sep 17 00:00:00 2001
From: Antonio Sanchez <cantonios@google.com>
Date: Thu, 9 Oct 2025 09:27:31 -0700
Subject: [PATCH] Add workaround for using std::fma for scalar multiply-add.

This is mainly to provide backward compatibility.  The new macro,
`EIGEN_SCALAR_MADD_USE_FMA`, should not be used in new code (or at
all, if avoidable).

Background: Eigen introduced then removed several uses of
`std::fma` for scalar multiply-add operations.  It was added
to increase precision and boost performance on systems that
support FMA in hardware.  But it turned out to significantly
slow down multiply-adds on systems that do not: 2-3x for Intel CPUs,
and 30x for WASM builds.  We then limited the usage to only cases
where hardware FMA is available.  This ensures consistency
between vectorized and non-vectorized paths, and keeps the
higher precision only when it will not affect performance.

Unfortunately, several projects seem to have added new tests that rely
on the intermediate behavior (where `std::fma` is always used), even
though they are not built with hardware FMA instructions enabled.
These tests now break.

To ease the transition, we introduce a temporary flag,
`EIGEN_SCALAR_MADD_USE_FMA`, that restores the use of `std::fma`.
---
 Eigen/src/Core/MathFunctions.h |  4 +---
 Eigen/src/Core/util/Macros.h   | 20 ++++++++++++++++++++
 doc/PreprocessorDirectives.dox | 11 ++++-------
 3 files changed, 25 insertions(+), 10 deletions(-)
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 155fdad20..5e36ce84d 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -1004,8 +1004,7 @@ struct madd_impl {
   }
 };
 
-// Use FMA if there is a single CPU instruction.
-#ifdef EIGEN_VECTORIZE_FMA
+#if EIGEN_SCALAR_MADD_USE_FMA
 template <typename Scalar>
 struct madd_impl<Scalar, std::enable_if_t<has_fma<Scalar>::value>> {
   static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
@@ -1927,7 +1926,6 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar arithmetic_shift_right(const Scalar
   return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n);
 }
 
-// Otherwise, rely on template implementation.
 template <typename Scalar>
 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) {
   return internal::fma_impl<Scalar>::run(x, y, z);
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index db4a63089..dad367169 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -52,6 +52,26 @@
 #define EIGEN_STACK_ALLOCATION_LIMIT 131072
 #endif
 
+/* Specify (as 1 or 0) whether to use std::fma for scalar multiply-add.
+ *
+ * On machines that have FMA as a single instruction, this will generally
+ * improve precision without significant performance implications.
+ *
+ * Without a single instruction, performance has been found to be reduced 2-3x
+ * on Intel CPUs, and up to 30x for WASM.
+ *
+ * If unspecified, defaults to 1 when EIGEN_VECTORIZE_FMA is defined, else 0.
+ * Overriding is intended only as a backward-compatibility aid; keep the
+ * default to ensure consistency between vectorized and non-vectorized paths.
+ */
+#ifndef EIGEN_SCALAR_MADD_USE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
+#define EIGEN_SCALAR_MADD_USE_FMA 1
+#else
+#define EIGEN_SCALAR_MADD_USE_FMA 0
+#endif
+#endif
+
 //------------------------------------------------------------------------------------------
 // Compiler identification, EIGEN_COMP_*
 //------------------------------------------------------------------------------------------
diff --git a/doc/PreprocessorDirectives.dox b/doc/PreprocessorDirectives.dox
index 27ae531f6..f4af90744 100644
--- a/doc/PreprocessorDirectives.dox
+++ b/doc/PreprocessorDirectives.dox
@@ -18,9 +18,6 @@ one option, and other parts (or libraries that you use) are compiled with anothe
 fail to link or exhibit subtle bugs. Nevertheless, these options can be useful for people who know what they
 are doing.
 
- - \b EIGEN2_SUPPORT and \b EIGEN2_SUPPORT_STAGEnn_xxx are disabled starting from the 3.3 release.
-   Defining one of these will raise a compile-error. If you need to compile Eigen2 code,
-   <a href="http://eigen.tuxfamily.org/index.php?title=Eigen2">check this site</a>.
  - \b EIGEN_DEFAULT_DENSE_INDEX_TYPE - the type for column and row indices in matrices, vectors and array
    (DenseBase::Index). Set to \c std::ptrdiff_t by default.
  - \b EIGEN_DEFAULT_IO_FORMAT - the IOFormat to use when printing a matrix if no %IOFormat is specified.
@@ -44,7 +41,7 @@ are doing.
    preferable. Not defined by default.
    \warning See the documentation of \c EIGEN_INITIALIZE_MATRICES_BY_ZERO for a discussion on a limitations
    of these macros when applied to \c 1x1, \c 1x2, and \c 2x1 fixed-size matrices.
- - \b EIGEN_NO_AUTOMATIC_RESIZING - if defined, the matrices (or arrays) on both sides of an assignment 
+ - \b EIGEN_NO_AUTOMATIC_RESIZING - if defined, the matrices (or arrays) on both sides of an assignment
    <tt>a = b</tt> have to be of the same size; otherwise, %Eigen automatically resizes \c a so that it is of
    the correct size. Not defined by default.
 
@@ -72,8 +69,8 @@ The %Eigen library contains many assertions to guard against programming errors,
 run time. However, these assertions do cost time and can thus be turned off.
 
  - \b EIGEN_NO_DEBUG - disables %Eigen's assertions if defined. Not defined by default, unless the
-   \c NDEBUG macro is defined (this is a standard C++ macro which disables all asserts). 
- - \b EIGEN_NO_STATIC_ASSERT - if defined, compile-time static assertions are replaced by runtime assertions; 
+   \c NDEBUG macro is defined (this is a standard C++ macro which disables all asserts).
+ - \b EIGEN_NO_STATIC_ASSERT - if defined, compile-time static assertions are replaced by runtime assertions;
    this saves compilation time. Not defined by default.
  - \b eigen_assert - macro with one argument that is used inside %Eigen for assertions. By default, it is
    basically defined to be \c assert, which aborts the program if the assertion is violated. Redefine this
@@ -90,7 +87,7 @@ run time. However, these assertions do cost time and can thus be turned off.
  Let us emphasize that \c EIGEN_MAX_*_ALIGN_BYTES define only a desirable upper bound. In practice data is aligned to largest power-of-two common divisor of \c EIGEN_MAX_STATIC_ALIGN_BYTES and the size of the data, such that memory is not wasted.
  - \b \c EIGEN_DONT_PARALLELIZE - if defined, this disables multi-threading. This is only relevant if you enabled OpenMP.
    See \ref TopicMultiThreading for details.
- - \b \c EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless 
+ - \b \c EIGEN_DONT_VECTORIZE - disables explicit vectorization when defined. Not defined by default, unless
    alignment is disabled by %Eigen's platform test or the user defining \c EIGEN_DONT_ALIGN.
  - \b \c EIGEN_UNALIGNED_VECTORIZE - disables/enables vectorization with unaligned stores. Default is 1 (enabled).
    If set to 0 (disabled), then expression for which the destination cannot be aligned are not vectorized (e.g., unaligned
-- 
GitLab