From f8191848ee3ad9599e5393299508b82886d2db44 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 4 Aug 2025 13:07:56 -0700 Subject: [PATCH 1/6] Fix pcmp_* for HVX to to comply with the new definition of true = Scalar(1). --- Eigen/src/Core/arch/HVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index ccba96efd..b9080d988 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -401,7 +401,7 @@ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { template EIGEN_STRONG_INLINE HVXPacket pcmp_le_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); } @@ -420,7 +420,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_eq_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -439,7 +439,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -458,7 +458,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_or_nan_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); + HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } -- GitLab From 7cc169d9476e45760ad72a051ad1c0615d417418 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Mon, 4 Aug 2025 13:41:46 -0700 Subject: [PATCH 2/6] Revert "Fix pcmp_* for HVX to to comply with the new definition of true = Scalar(1)." This reverts commit f8191848ee3ad9599e5393299508b82886d2db44. --- Eigen/src/Core/arch/HVX/PacketMath.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index b9080d988..ccba96efd 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -401,7 +401,7 @@ EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) { template EIGEN_STRONG_INLINE HVXPacket pcmp_le_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true)); } @@ -420,7 +420,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_eq_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -439,7 +439,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } @@ -458,7 +458,7 @@ EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) { template EIGEN_STRONG_INLINE HVXPacket pcmp_lt_or_nan_hvx(const HVXPacket& a, const HVXPacket& b) { - HVX_Vector v_true = Q6_V_vsplat_R(0x3f800000); + HVX_Vector v_true = Q6_Vb_vsplat_R(0xff); HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get()); return HVXPacket::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero())); } -- GitLab From 8ac6776925164fecbcb7d4a4dd4a7f8d7ae08a58 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 31 Oct 2025 14:07:04 -0700 Subject: [PATCH 3/6] Implement assume_aligned using the standard API if available. --- Eigen/src/Core/arch/NEON/Complex.h | 24 +++--- Eigen/src/Core/arch/NEON/PacketMath.h | 119 ++++++++++---------------- Eigen/src/Core/util/Memory.h | 26 +++--- 3 files changed, 70 insertions(+), 99 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index f3f6a1a1b..9c0c28f6f 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -280,13 +280,13 @@ EIGEN_STRONG_INLINE Packet2cf pandnot(const Packet2cf& a, const Packe template <> EIGEN_STRONG_INLINE Packet1cf pload(const std::complex* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(pload((const float*)from)); + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf( + pload(reinterpret_cast(assume_aligned::alignment>(from)))); } template <> EIGEN_STRONG_INLINE Packet2cf pload(const std::complex* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload(reinterpret_cast(from))); + EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf( + pload(reinterpret_cast(assume_aligned::alignment>(from)))); } template <> @@ -309,13 +309,13 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* fro template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cf& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); + EIGEN_DEBUG_ALIGNED_STORE pstore( + reinterpret_cast(assume_aligned::alignment>(to), from.v)); } template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet2cf& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); + EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(assume_aligned::alignment>(to)), + from.v); } template <> @@ -531,8 +531,8 @@ struct unpacket_traits : neon_unpacket_default EIGEN_STRONG_INLINE Packet1cd pload(const std::complex* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload(reinterpret_cast(from))); + EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd( + pload(reinterpret_cast(assume_aligned::alignment>(from)))); } template <> @@ -645,8 +645,8 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* fr template <> EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cd& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(to), from.v); + EIGEN_DEBUG_ALIGNED_STORE pstore( + reinterpret_cast(assume_aligned::alignment>(to), from.v)); } template <> diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index 36de8af9c..e5cc4d18b 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -2261,13 +2261,11 @@ EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) { template <> EIGEN_STRONG_INLINE Packet2f pload(const float* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) { @@ -2277,13 +2275,11 @@ EIGEN_STRONG_INLINE Packet4c pload(const int8_t* from) { } template <> EIGEN_STRONG_INLINE Packet8c pload(const int8_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet16c pload(const int8_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) { @@ -2293,63 +2289,51 @@ EIGEN_STRONG_INLINE Packet4uc pload(const uint8_t* from) { } template <> EIGEN_STRONG_INLINE Packet8uc pload(const uint8_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet16uc pload(const uint8_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4s pload(const int16_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet8s pload(const int16_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4us pload(const uint16_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet8us pload(const uint16_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet2i pload(const int32_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet2ui pload(const uint32_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet4ui pload(const uint32_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet2l pload(const int64_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(assume_aligned::alignment>(from)); } template <> EIGEN_STRONG_INLINE Packet2ul pload(const uint64_t* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(from); + EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(assume_aligned::alignment>(from)); } template <> @@ -2573,13 +2557,11 @@ EIGEN_STRONG_INLINE Packet4ui ploadquad(const uint32_t* from) { template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet2f& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_f32(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_f32(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) { @@ -2587,13 +2569,11 @@ EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet4c& from) { } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet8c& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_s8(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_s8(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int8_t* to, const Packet16c& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) { @@ -2601,63 +2581,51 @@ EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet4uc& from) { } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet8uc& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_u8(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_u8(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint8_t* to, const Packet16uc& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet4s& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_s16(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_s16(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int16_t* to, const Packet8s& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet4us& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_u16(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_u16(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint16_t* to, const Packet8us& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet2i& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_s32(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_s32(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet2ui& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_u32(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1_u32(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint32_t* to, const Packet4ui& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(int64_t* to, const Packet2l& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(assume_aligned::alignment>(to), from); } template <> EIGEN_STRONG_INLINE void pstore(uint64_t* to, const Packet2ul& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(assume_aligned::alignment>(to), from); } template <> @@ -4731,7 +4699,7 @@ EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { template <> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); +unpacket_traits::alignment); return Packet4bf(pload(reinterpret_cast(from))); } @@ -4742,8 +4710,8 @@ EIGEN_STRONG_INLINE Packet4bf ploadu(const bfloat16* from) { template <> EIGEN_STRONG_INLINE void pstore(bfloat16* to, const Packet4bf& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_u16(reinterpret_cast(to), from); + EIGEN_DEBUG_ALIGNED_STORE vst1_u16( + reinterpret_cast(assume_aligned::alignment>(to)), from); } template <> @@ -5233,7 +5201,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { template <> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); +unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } @@ -5248,8 +5216,7 @@ EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { } template <> EIGEN_STRONG_INLINE void pstore(double* to, const Packet2d& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(to, from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(assume_aligned::alignment>(to), from); } template <> @@ -5776,13 +5743,13 @@ EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packe template <> EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); +unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { - EIGEN_ASSUME_ALIGNED(from, unpacket_traits::alignment); +unpacket_traits::alignment); EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); } @@ -5858,14 +5825,14 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet8hf& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(reinterpret_cast(to), from); + EIGEN_DEBUG_ALIGNED_STORE vst1q_f16( + reinterpret_cast(assume_aligned::alignment>(to)), from); } template <> EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet4hf& from) { - EIGEN_ASSUME_ALIGNED(to, unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_STORE vst1_f16(reinterpret_cast(to), from); + EIGEN_DEBUG_ALIGNED_STORE vst1_f16( + reinterpret_cast(assume_aligned::alignment>(to)), from); } template <> diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index f8d3c42ec..45290f33a 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -1354,18 +1354,22 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) { } #endif -/** \internal - * This informs the implementation that PTR is aligned to at least ALIGN_BYTES - */ -#ifndef EIGEN_ASSUME_ALIGNED -#if defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L) -#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \ - { PTR = std::assume_aligned(PTR); } -#elif EIGEN_HAS_BUILTIN(__builtin_assume_aligned) -#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) \ - { PTR = static_cast(__builtin_assume_aligned(PTR, ALIGN_BYTES)); } + +#if !defined(EIGEN_DONT_ASSUME_ALIGNED) && defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L) +template +EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION constexpr T* assume_aligned(T* ptr) { + return std::assume_aligned(ptr); +} +#elif !defined(EIGEN_DONT_ASSUME_ALIGNED) && EIGEN_HAS_BUILTIN(__builtin_assume_aligned) +template +EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION T* assume_aligned(T* ptr) { + return static_cast(__builtin_assume_aligned(ptr, N)); +} #else -#define EIGEN_ASSUME_ALIGNED(PTR, ALIGN_BYTES) /* do nothing */ +template +EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION constexpr T* assume_aligned(T* ptr) { + return ptr; +} #endif #endif -- GitLab From 97b299fa6eab119b670b6cf0ba2f9e18465a2ead Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 31 Oct 2025 14:10:10 -0700 Subject: [PATCH 4/6] Format. --- Eigen/src/Core/arch/NEON/Complex.h | 22 +++++++++++----------- Eigen/src/Core/arch/NEON/PacketMath.h | 8 ++++---- Eigen/src/Core/util/Memory.h | 23 +++++++++++------------ 3 files changed, 26 insertions(+), 27 deletions(-) diff --git a/Eigen/src/Core/arch/NEON/Complex.h b/Eigen/src/Core/arch/NEON/Complex.h index 9c0c28f6f..8b2c5ffe3 100644 --- a/Eigen/src/Core/arch/NEON/Complex.h +++ b/Eigen/src/Core/arch/NEON/Complex.h @@ -48,7 +48,7 @@ struct Packet2cf { }; template <> -struct packet_traits > : default_packet_traits { +struct packet_traits> : default_packet_traits { typedef Packet2cf type; typedef Packet1cf half; enum { @@ -308,22 +308,22 @@ EIGEN_STRONG_INLINE Packet2cf ploaddup(const std::complex* fro } template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cf& from) { +EIGEN_STRONG_INLINE void pstore>(std::complex* to, const Packet1cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore( reinterpret_cast(assume_aligned::alignment>(to), from.v)); } template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet2cf& from) { +EIGEN_STRONG_INLINE void pstore>(std::complex* to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(reinterpret_cast(assume_aligned::alignment>(to)), from.v); } template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet1cf& from) { +EIGEN_STRONG_INLINE void pstoreu>(std::complex* to, const Packet1cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); } template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet2cf& from) { +EIGEN_STRONG_INLINE void pstoreu>(std::complex* to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } @@ -356,7 +356,7 @@ EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf>(std::comp } template <> -EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { +EIGEN_STRONG_INLINE void prefetch>(const std::complex* addr) { EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } @@ -501,7 +501,7 @@ struct Packet1cd { }; template <> -struct packet_traits > : default_packet_traits { +struct packet_traits> : default_packet_traits { typedef Packet1cd type; typedef Packet1cd half; enum { @@ -644,18 +644,18 @@ EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex* fr } template <> -EIGEN_STRONG_INLINE void pstore >(std::complex* to, const Packet1cd& from) { +EIGEN_STRONG_INLINE void pstore>(std::complex* to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore( reinterpret_cast(assume_aligned::alignment>(to), from.v)); } template <> -EIGEN_STRONG_INLINE void pstoreu >(std::complex* to, const Packet1cd& from) { +EIGEN_STRONG_INLINE void pstoreu>(std::complex* to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast(to), from.v); } template <> -EIGEN_STRONG_INLINE void prefetch >(const std::complex* addr) { +EIGEN_STRONG_INLINE void prefetch>(const std::complex* addr) { EIGEN_ARM_PREFETCH(reinterpret_cast(addr)); } @@ -677,7 +677,7 @@ EIGEN_DEVICE_FUNC inline void pscatter, Packet1cd>(std::com template <> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { EIGEN_ALIGN16 std::complex res; - pstore >(&res, a); + pstore>(&res, a); return res; } diff --git a/Eigen/src/Core/arch/NEON/PacketMath.h b/Eigen/src/Core/arch/NEON/PacketMath.h index e5cc4d18b..93da6582b 100644 --- a/Eigen/src/Core/arch/NEON/PacketMath.h +++ b/Eigen/src/Core/arch/NEON/PacketMath.h @@ -4700,7 +4700,7 @@ EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet4bf& from) { template <> EIGEN_STRONG_INLINE Packet4bf pload(const bfloat16* from) { unpacket_traits::alignment); - return Packet4bf(pload(reinterpret_cast(from))); +return Packet4bf(pload(reinterpret_cast(from))); } template <> @@ -5202,7 +5202,7 @@ EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) { template <> EIGEN_STRONG_INLINE Packet2d pload(const double* from) { unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); +EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(from); } template <> @@ -5744,13 +5744,13 @@ EIGEN_STRONG_INLINE Packet4hf pandnot(const Packet4hf& a, const Packe template <> EIGEN_STRONG_INLINE Packet8hf pload(const Eigen::half* from) { unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); +EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(reinterpret_cast(from)); } template <> EIGEN_STRONG_INLINE Packet4hf pload(const Eigen::half* from) { unpacket_traits::alignment); - EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); +EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(reinterpret_cast(from)); } template <> diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 45290f33a..27106f842 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -1294,17 +1294,17 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) { // by default let's use Intel's API queryCacheSizes_intel(l1, l2, l3, max_std_funcs); - // here is the list of other vendors: - // ||cpuid_is_vendor(abcd,"VIA VIA VIA ") - // ||cpuid_is_vendor(abcd,"CyrixInstead") - // ||cpuid_is_vendor(abcd,"CentaurHauls") - // ||cpuid_is_vendor(abcd,"GenuineTMx86") - // ||cpuid_is_vendor(abcd,"TransmetaCPU") - // ||cpuid_is_vendor(abcd,"RiseRiseRise") - // ||cpuid_is_vendor(abcd,"Geode by NSC") - // ||cpuid_is_vendor(abcd,"SiS SiS SiS ") - // ||cpuid_is_vendor(abcd,"UMC UMC UMC ") - // ||cpuid_is_vendor(abcd,"NexGenDriven") + // here is the list of other vendors: + // ||cpuid_is_vendor(abcd,"VIA VIA VIA ") + // ||cpuid_is_vendor(abcd,"CyrixInstead") + // ||cpuid_is_vendor(abcd,"CentaurHauls") + // ||cpuid_is_vendor(abcd,"GenuineTMx86") + // ||cpuid_is_vendor(abcd,"TransmetaCPU") + // ||cpuid_is_vendor(abcd,"RiseRiseRise") + // ||cpuid_is_vendor(abcd,"Geode by NSC") + // ||cpuid_is_vendor(abcd,"SiS SiS SiS ") + // ||cpuid_is_vendor(abcd,"UMC UMC UMC ") + // ||cpuid_is_vendor(abcd,"NexGenDriven") #else l1 = l2 = l3 = -1; #endif @@ -1354,7 +1354,6 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) { } #endif - #if !defined(EIGEN_DONT_ASSUME_ALIGNED) && defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L) template EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION constexpr T* assume_aligned(T* ptr) { -- GitLab From b31798be40872fce2e63268a88b17cb51ae03175 Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 31 Oct 2025 14:27:25 -0700 Subject: [PATCH 5/6] Fix typos. --- Eigen/src/Core/util/Memory.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 27106f842..6d01458b3 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -1356,21 +1356,20 @@ EIGEN_DEVICE_FUNC void destroy_at(T* p) { #if !defined(EIGEN_DONT_ASSUME_ALIGNED) && defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L) template -EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION constexpr T* assume_aligned(T* ptr) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr T* assume_aligned(T* ptr) { return std::assume_aligned(ptr); } #elif !defined(EIGEN_DONT_ASSUME_ALIGNED) && EIGEN_HAS_BUILTIN(__builtin_assume_aligned) template -EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION T* assume_aligned(T* ptr) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC T* assume_aligned(T* ptr) { return static_cast(__builtin_assume_aligned(ptr, N)); } #else template -EIGEN_STRING_INLINE EIGEN_DEVICE_FUNCTION constexpr T* assume_aligned(T* ptr) { +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr T* assume_aligned(T* ptr) { return ptr; } #endif -#endif } // end namespace internal -- GitLab From 04b3d312d9ccd41435ecbc4cc65f08a397008e7d Mon Sep 17 00:00:00 2001 From: Rasmus Munk Larsen Date: Fri, 31 Oct 2025 14:57:16 -0700 Subject: [PATCH 6/6] Unformat. --- Eigen/src/Core/util/Memory.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 6d01458b3..b1c244c9e 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -1294,17 +1294,17 @@ inline void queryCacheSizes(int& l1, int& l2, int& l3) { // by default let's use Intel's API queryCacheSizes_intel(l1, l2, l3, max_std_funcs); - // here is the list of other vendors: - // ||cpuid_is_vendor(abcd,"VIA VIA VIA ") - // ||cpuid_is_vendor(abcd,"CyrixInstead") - // ||cpuid_is_vendor(abcd,"CentaurHauls") - // ||cpuid_is_vendor(abcd,"GenuineTMx86") - // ||cpuid_is_vendor(abcd,"TransmetaCPU") - // ||cpuid_is_vendor(abcd,"RiseRiseRise") - // ||cpuid_is_vendor(abcd,"Geode by NSC") - // ||cpuid_is_vendor(abcd,"SiS SiS SiS ") - // ||cpuid_is_vendor(abcd,"UMC UMC UMC ") - // ||cpuid_is_vendor(abcd,"NexGenDriven") + // here is the list of other vendors: + // ||cpuid_is_vendor(abcd,"VIA VIA VIA ") + // ||cpuid_is_vendor(abcd,"CyrixInstead") + // ||cpuid_is_vendor(abcd,"CentaurHauls") + // ||cpuid_is_vendor(abcd,"GenuineTMx86") + // ||cpuid_is_vendor(abcd,"TransmetaCPU") + // ||cpuid_is_vendor(abcd,"RiseRiseRise") + // ||cpuid_is_vendor(abcd,"Geode by NSC") + // ||cpuid_is_vendor(abcd,"SiS SiS SiS ") + // ||cpuid_is_vendor(abcd,"UMC UMC UMC ") + // ||cpuid_is_vendor(abcd,"NexGenDriven") #else l1 = l2 = l3 = -1; #endif -- GitLab