From 7b0d78c428828d6ff4ba2ddc30d48b4f6be56d4d Mon Sep 17 00:00:00 2001 From: Gerhard Reitmayr Date: Tue, 23 Jan 2024 13:01:33 -0800 Subject: [PATCH] change to support arbitrary partial vectorization with HVX for dynamic vectors --- Eigen/src/Core/AssignEvaluator.h | 20 ++++++++- Eigen/src/Core/CoreEvaluators.h | 28 +++++++++++++ Eigen/src/Core/Swap.h | 9 ++++ Eigen/src/Core/arch/HVX/PacketMath.h | 44 ++++++++++++++++++++ Eigen/src/Core/functors/AssignmentFunctors.h | 29 +++++++++++++ Eigen/src/Core/util/ConfigureVectorization.h | 3 ++ Eigen/src/Core/util/Memory.h | 4 +- 7 files changed, 133 insertions(+), 4 deletions(-) diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index f7f0b238b..9c9529645 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -419,12 +419,21 @@ struct dense_assignment_loop { dstIsAligned ? 0 : internal::first_aligned(kernel.dstDataPtr(), size); const Index alignedEnd = alignedStart + ((size - alignedStart) / packetSize) * packetSize; +#ifdef EIGEN_VECTORIZE_PARTIAL + if (alignedStart > 0) kernel.template assignPartialPacket<0, 0, PacketType>(0, alignedStart); +#else unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); +#endif // EIGEN_VECTORIZE_PARTIAL - for (Index index = alignedStart; index < alignedEnd; index += packetSize) + Index index = alignedStart; + for (; index < alignedEnd; index += packetSize) kernel.template assignPacket(index); - +#ifdef EIGEN_VECTORIZE_PARTIAL + Index leftover = size - alignedEnd; + if (leftover > 0) kernel.template assignPartialPacket(index, leftover); +#else unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); +#endif // EIGEN_VECTORIZE_PARTIAL } }; @@ -641,6 +650,13 @@ class generic_dense_assignment_kernel { m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPartialPacket(Index index, const int& partial_alignment) { + if (partial_alignment <= 0) return; + m_functor.template assignPartialPacket( + &m_dst.coeffRef(index), m_src.template packet(index), partial_alignment); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = rowIndexByOuterInner(outer, inner); diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index c6206005e..ad649ae68 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -242,6 +242,11 @@ struct evaluator > : evaluator_base { return pstoret(const_cast(m_d.data) + index, x); } + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, const int& partial_store_alignment) { + return pstoret_partial(const_cast(m_d.data) + index, x, partial_store_alignment); + } + protected: plainobjectbase_evaluator_data m_d; }; @@ -317,6 +322,10 @@ struct unary_evaluator, IndexBased> : evaluator_base(index, x); } + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, const int& partial_store_alignment) { + m_argImpl.template writePartialPacket(index, x, partial_store_alignment); + } protected: evaluator m_argImpl; }; @@ -1032,6 +1041,12 @@ struct mapbase_evaluator : evaluator_base { internal::pstoret(m_data + index * m_innerStride.value(), x); } + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, const int& partial_store_alignment) { + internal::pstoret_partial(m_data + index * m_innerStride.value(), x, + partial_store_alignment, 0); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowStride() const EIGEN_NOEXCEPT { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); @@ -1435,6 +1450,12 @@ struct evaluator_wrapper_base : evaluator_base { m_argImpl.template writePacket(index, x); } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, const int& partial_store_alignment) { + m_argImpl.template writePartialPacket(index, x, partial_store_alignment); + } + protected: evaluator m_argImpl; }; @@ -1551,6 +1572,13 @@ struct unary_evaluator > : evaluator_base(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); } + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, const int& partial_store_alignment) { + enum { PacketSize = unpacket_traits::size }; + m_argImpl.template writePartialPacket(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x), + partial_store_alignment); + } + protected: evaluator m_argImpl; diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index d417c1ad1..ffbfc024d 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -57,6 +57,15 @@ class generic_dense_assignment_kernel(index, tmp); } + template + EIGEN_STRONG_INLINE void assignPartialPacket(Index index, const int &partial_alignment) { + if (partial_alignment <= 0) return; + PacketType tmp = m_src.template packet(index); + const_cast(m_src).template writePartialPacket( + index, m_dst.template packet(index), partial_alignment); + m_dst.template writePartialPacket(index, tmp, partial_alignment); + } + // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I // mean no CRTP (Gael) template diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index 7e139de13..713399dd1 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -49,6 +49,34 @@ EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) { return v; } +// This function stores the first n bytes from vector v to address 'mem'. +// n must be in range 1..128 and mem may have any alignment. Does one or +// two masked stores +template +EIGEN_STRONG_INLINE void vstu_variable(T* mem, uint32_t n, HVX_Vector vin) { + // Rotate as needed. + uintptr_t mem_addr = reinterpret_cast(mem); + + vin = Q6_V_vlalign_VVR(vin, vin, mem_addr); + + uint32_t left_off = mem_addr & (__HVX_LENGTH__ - 1); + uint32_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + EIGEN_IF_CONSTEXPR(n > Alignment) { + if (right_off > __HVX_LENGTH__) { + Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), vin); + qr = Q6_Q_vcmp_eq_VbVb(vin, vin); + } + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, (HVX_Vector*)mem, vin); +} + + template EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) { #if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD) @@ -334,6 +362,22 @@ EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from) { HVX_store_partial::size, 0>(to, from.Get()); } +template <> +EIGEN_STRONG_INLINE void pstoreu_partial(float* to, const Packet32f& from, const Index n, const Index offset) { + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements plus offset will write past end of packet"); + Index store_size = numext::mini(n, packet_size - offset); + vstu_variable<0>(to, sizeof(float) * store_size, Q6_V_valign_VVR(from.Get(), from.Get(), offset * sizeof(float))); +} + +template <> +EIGEN_STRONG_INLINE void pstore_partial(float* to, const Packet32f& from, const Index n, const Index offset) { + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements plus offset will write past end of packet"); + Index store_size = numext::mini(n, packet_size - offset); + vstu_variable<0>(to, sizeof(float) * store_size, Q6_V_valign_VVR(from.Get(), from.Get(), offset * sizeof(float))); +} + template EIGEN_STRONG_INLINE HVXPacket pmul_hvx(const HVXPacket& a, const HVXPacket& b) { return HVXPacket::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()))); diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 09d1da8ca..2732c2b72 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -29,6 +29,11 @@ struct assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, b); } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, const int& partial_alignment) const { + internal::pstoret_partial(a, b, partial_alignment, 0); + } }; // Empty overload for void type (used by PermutationMatrix) @@ -56,6 +61,12 @@ struct add_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::padd(internal::ploadt(a), b)); } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, const int& partial_alignment) const { + internal::pstoret_partial( + a, internal::padd(internal::ploadt(a), b), partial_alignment, 0); + } }; template struct functor_traits > { @@ -77,6 +88,12 @@ struct sub_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::psub(internal::ploadt(a), b)); } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, const int& partial_alignment) const { + internal::pstoret_partial( + a, internal::psub(internal::ploadt(a), b), partial_alignment, 0); + } }; template struct functor_traits > { @@ -98,6 +115,12 @@ struct mul_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::pmul(internal::ploadt(a), b)); } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, const int& partial_alignment) const { + internal::pstoret_partial( + a, internal::pmul(internal::ploadt(a), b), partial_alignment, 0); + } }; template struct functor_traits > { @@ -119,6 +142,12 @@ struct div_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::pdiv(internal::ploadt(a), b)); } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, const int& partial_alignment) const { + internal::pstoret_partial( + a, internal::pdiv(internal::ploadt(a), b), partial_alignment, 0); + } }; template struct functor_traits > { diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index b16952a20..9cf04c631 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -419,6 +419,7 @@ extern "C" { #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_HVX +#define EIGEN_VECTORIZE_PARTIAL #include #endif @@ -500,6 +501,8 @@ inline static const char *SimdInstructionSetsInUse(void) { return "S390X ZVECTOR"; #elif defined(EIGEN_VECTORIZE_MSA) return "MIPS MSA"; +#elif defined(EIGEN_VECTORIZE_HVX) + return "HEXAGON HVX"; #else return "None"; #endif diff --git a/Eigen/src/Core/util/Memory.h b/Eigen/src/Core/util/Memory.h index 62534540c..bb3b232ee 100644 --- a/Eigen/src/Core/util/Memory.h +++ b/Eigen/src/Core/util/Memory.h @@ -146,7 +146,7 @@ EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size, check_that_malloc_is_allowed(); EIGEN_USING_STD(malloc) - void* original = malloc(size + alignment); + void* original = malloc(size + 2*alignment); if (original == 0) return 0; uint8_t offset = static_cast(alignment - (reinterpret_cast(original) & (alignment - 1))); void* aligned = static_cast(static_cast(original) + offset); @@ -179,7 +179,7 @@ EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t n check_that_malloc_is_allowed(); EIGEN_USING_STD(realloc) - void* original = realloc(old_original, new_size + alignment); + void* original = realloc(old_original, new_size + 2*alignment); if (original == nullptr) return nullptr; if (original == old_original) return ptr; uint8_t offset = static_cast(alignment - (reinterpret_cast(original) & (alignment - 1))); -- GitLab