diff --git a/Eigen/src/Core/AssignEvaluator.h b/Eigen/src/Core/AssignEvaluator.h index f7f0b238b8ca70bbc9100262479cc1dbebab9979..e1e05de297269a63466889b77cae138c4d33ffa2 100644 --- a/Eigen/src/Core/AssignEvaluator.h +++ b/Eigen/src/Core/AssignEvaluator.h @@ -291,6 +291,21 @@ struct copy_using_evaluator_innervec_InnerUnrolling +struct copy_using_evaluator_innervec_InnerUnrolling_partial { + using PacketType = typename Kernel::PacketType; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) { + kernel.template assignPartialPacketByOuterInner(outer, Start, Stop - Start, + 0); + } +}; + +template +struct copy_using_evaluator_innervec_InnerUnrolling_partial { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) {} +}; + /*************************************************************************** * Part 3 : implementation of all cases ***************************************************************************/ @@ -354,7 +369,7 @@ struct dense_assignment_loop { ***************************/ // The goal of unaligned_dense_assignment_loop is simply to factorize the handling -// of the non vectorizable beginning and ending parts +// of the non-aligned beginning parts template struct unaligned_dense_assignment_loop { @@ -368,7 +383,7 @@ struct unaligned_dense_assignment_loop { // MSVC must not inline this functions. If it does, it fails to optimize the // packet access path. // FIXME check which version exhibits this issue -#if EIGEN_COMP_MSVC +#if EIGEN_COMP_MSVC < 1936 template static EIGEN_DONT_INLINE void run(Kernel& kernel, Index start, Index end) #else @@ -376,7 +391,22 @@ struct unaligned_dense_assignment_loop { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel, Index start, Index end) #endif { +#if defined(EIGEN_VECTORIZE_PARTIAL) + using PacketType = typename Kernel::PacketType; + constexpr int PacketSize = unpacket_traits::size; + constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; + + const Index unalignedPacketOps = (end - start) / PacketSize; + const Index unalignedPacketEnd = start + (unalignedPacketOps * PacketSize); + + for (Index index = start; index < unalignedPacketEnd; index += PacketSize) + kernel.template assignPacket(index); + if (end > unalignedPacketEnd) + kernel.template assignPartialPacket(unalignedPacketEnd, + end - unalignedPacketEnd, 0); +#else for (Index index = start; index < end; ++index) kernel.assignCoeff(index); +#endif // EIGEN_VECTORIZE_PARTIAL } }; @@ -401,6 +431,25 @@ struct copy_using_evaluator_linearvec_CompleteUnrolling { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel&) {} }; +template +struct copy_using_evaluator_linearvec_CompleteUnrolling_partial { + typedef typename Kernel::DstEvaluatorType DstEvaluatorType; + typedef typename DstEvaluatorType::XprType DstXprType; + typedef typename Kernel::PacketType PacketType; + + static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment; + static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment; + + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) { + kernel.template assignPartialPacket(Index, Stop - Index, 0); + } +}; + +template +struct copy_using_evaluator_linearvec_CompleteUnrolling_partial { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) {} +}; + template struct dense_assignment_loop { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { @@ -419,29 +468,47 @@ struct 
dense_assignment_loop { dstIsAligned ? 0 : internal::first_aligned(kernel.dstDataPtr(), size); const Index alignedEnd = alignedStart + ((size - alignedStart) / packetSize) * packetSize; +#if defined(EIGEN_VECTORIZE_PARTIAL) + if (alignedStart > 0) { + if (alignedStart <= packetSize) + kernel.template assignPartialPacket(0, alignedStart, 0); + else + unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); + } +#else unaligned_dense_assignment_loop::run(kernel, 0, alignedStart); +#endif // EIGEN_VECTORIZE_PARTIAL - for (Index index = alignedStart; index < alignedEnd; index += packetSize) + Index index = alignedStart; + for (; index < alignedEnd; index += packetSize) kernel.template assignPacket(index); - +#if defined(EIGEN_VECTORIZE_PARTIAL) + Index leftover = size - alignedEnd; + if (leftover > 0) kernel.template assignPartialPacket(index, leftover, 0); +#else unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size); +#endif // EIGEN_VECTORIZE_PARTIAL } }; template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { - typedef typename Kernel::DstEvaluatorType::XprType DstXprType; - typedef typename Kernel::PacketType PacketType; + using DstXprType = typename Kernel::DstEvaluatorType::XprType; + using PacketType = typename Kernel::PacketType; - enum { - size = DstXprType::SizeAtCompileTime, - packetSize = unpacket_traits::size, - alignedSize = (int(size) / packetSize) * packetSize - }; + static constexpr int Size = DstXprType::SizeAtCompileTime; + static constexpr int PacketSize = unpacket_traits::size; + static constexpr int AlignedSize = (Size / PacketSize) * PacketSize; + static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment; + static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment; - copy_using_evaluator_linearvec_CompleteUnrolling::run(kernel); - copy_using_evaluator_LinearTraversal_CompleteUnrolling::run(kernel); + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + copy_using_evaluator_linearvec_CompleteUnrolling::run(kernel); + // #if defined(EIGEN_VECTORIZE_PARTIAL) + // copy_using_evaluator_linearvec_CompleteUnrolling_partial::run(kernel); + // #else + copy_using_evaluator_LinearTraversal_CompleteUnrolling::run(kernel); + // #endif // EIGEN_VECTORIZE_PARTIAL } }; @@ -509,6 +576,45 @@ struct dense_assignment_loop { template struct dense_assignment_loop { +#if defined(EIGEN_VECTORIZE_PARTIAL) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { + typedef typename Kernel::Scalar Scalar; + typedef typename Kernel::PacketType PacketType; + enum {packetSize = unpacket_traits::size, + requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment), + alignable = + packet_traits::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment) >= sizeof(Scalar), + dstAlignment = alignable ? 
int(requestedAlignment) : int(Kernel::AssignmentTraits::DstAlignment)}; + + const Scalar* dst_ptr = kernel.dstDataPtr(); + const Index outerStride = kernel.outerStride(); + const Index innerSize = kernel.innerSize(); + const Index outerSize = kernel.outerSize(); + + for (Index outer = 0; outer < outerSize; ++outer) { + Index alignedStart = internal::first_aligned(dst_ptr, innerSize); + + Index inner = 0; + + // do the unaligned portion of the assignment + for (; inner + packetSize <= alignedStart; inner += packetSize) + kernel.template assignPacketByOuterInner(outer, inner); + if (alignedStart > inner) + kernel.template assignPartialPacketByOuterInner(outer, inner, + alignedStart - inner, 0); + + inner = alignedStart; + + // do the aligned portion of the assignment + for (; inner + packetSize <= innerSize; inner += packetSize) + kernel.template assignPacketByOuterInner(outer, inner); + if (innerSize > inner) + kernel.template assignPartialPacketByOuterInner(outer, inner, + innerSize - inner, 0); + dst_ptr += outerStride; + } + } +#else EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE EIGEN_CONSTEXPR void run(Kernel& kernel) { typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; @@ -547,6 +653,7 @@ struct dense_assignment_loop { alignedStart = numext::mini((alignedStart + alignedStep) % packetSize, innerSize); } } +#endif // EIGEN_VECTORIZE_PARTIAL }; #if EIGEN_UNALIGNED_VECTORIZE @@ -564,8 +671,14 @@ struct dense_assignment_loop { }; for (Index outer = 0; outer < kernel.outerSize(); ++outer) { - copy_using_evaluator_innervec_InnerUnrolling::run(kernel, outer); + copy_using_evaluator_innervec_InnerUnrolling::run(kernel, + outer); +#if defined(EIGEN_VECTORIZE_PARTIAL) + copy_using_evaluator_innervec_InnerUnrolling_partial::run(kernel, outer); +#else copy_using_evaluator_DefaultTraversal_InnerUnrolling::run(kernel, outer); +#endif // EIGEN_VECTORIZE_PARTIAL } } }; @@ -641,6 +754,18 @@ class generic_dense_assignment_kernel { m_functor.template assignPacket(&m_dst.coeffRef(index), m_src.template packet(index)); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPartialPacket(Index row, Index col, Index n, Index offset) { + m_functor.template assignPartialPacket( + &m_dst.coeffRef(row, col), m_src.template partialPacket(row, col, n, offset), n, offset); + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPartialPacket(Index index, Index n, Index offset) { + m_functor.template assignPartialPacket( + &m_dst.coeffRef(index), m_src.template partialPacket(index, n, offset), n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) { Index row = rowIndexByOuterInner(outer, inner); @@ -648,6 +773,14 @@ class generic_dense_assignment_kernel { assignPacket(row, col); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPartialPacketByOuterInner(Index outer, Index inner, Index n, + Index offset) { + Index row = rowIndexByOuterInner(outer, inner); + Index col = colIndexByOuterInner(outer, inner); + assignPartialPacket(row, col, n, offset); + } + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) { typedef typename DstEvaluatorType::ExpressionTraits Traits; return int(Traits::RowsAtCompileTime) == 1 ? 
0 diff --git a/Eigen/src/Core/CoreEvaluators.h b/Eigen/src/Core/CoreEvaluators.h index 3d78fd8be712af0c00d32368b4477e5de78ed28c..44833d7493a0eff11ed4a9f7f5dcbd377bec196b 100644 --- a/Eigen/src/Core/CoreEvaluators.h +++ b/Eigen/src/Core/CoreEvaluators.h @@ -242,6 +242,34 @@ struct evaluator> : evaluator_base { return pstoret(const_cast(m_d.data) + index, x); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + if (IsRowMajor) + return ploadt_partial(m_d.data + row * m_d.outerStride() + col, n, offset); + else + return ploadt_partial(m_d.data + row + col * m_d.outerStride(), n, offset); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return ploadt_partial(m_d.data + index, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index row, Index col, const PacketType& x, Index n, Index offset) { + if (IsRowMajor) + pstoret_partial(const_cast(m_d.data) + row * m_d.outerStride() + col, x, + n, offset); + else + pstoret_partial(const_cast(m_d.data) + row + col * m_d.outerStride(), x, + n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, Index n, Index offset) { + pstoret_partial(const_cast(m_d.data) + index, x, n, offset); + } + protected: plainobjectbase_evaluator_data m_d; }; @@ -315,6 +343,26 @@ struct unary_evaluator, IndexBased> : evaluator_base(index, x); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return m_argImpl.template partialPacket(col, row, n, offset); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return m_argImpl.template partialPacket(index, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index row, Index col, const PacketType& x, Index n, Index offset) { + m_argImpl.template writePartialPacket(col, row, x, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, Index n, Index offset) { + m_argImpl.template writePartialPacket(index, x, n, offset); + } + protected: evaluator m_argImpl; }; @@ -499,6 +547,16 @@ struct evaluator> return m_wrapper.template packetOp(m_functor, index); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(IndexType row, IndexType col, Index, Index) const { + return packet(row, col); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(IndexType index, Index, Index) const { + return packet(index); + } + protected: const NullaryOp m_functor; const internal::nullary_wrapper m_wrapper; @@ -543,6 +601,16 @@ struct unary_evaluator, IndexBased> : evaluator_b return m_d.func().packetOp(m_d.argImpl.template packet(index)); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return m_d.func().packetOp(m_d.argImpl.template partialPacket(row, col, n, offset)); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return m_d.func().packetOp(m_d.argImpl.template partialPacket(index, n, offset)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -642,30 +710,43 @@ struct unary_evaluator, ArgType>, In return m_argImpl.template packet(actualIndex); } + template + EIGEN_STRONG_INLINE PacketType srcPartialPacket(Index row, Index col, Index offset, Index n, Index loadOffset) 
const { + constexpr Index PacketSize = unpacket_traits::size; + Index packetStart = offset * PacketSize; + Index actualRow = IsRowMajor ? row : row + packetStart; + Index actualCol = IsRowMajor ? col + packetStart : col; + Index start = numext::mini(numext::maxi(loadOffset - packetStart, Index(0)), PacketSize); + Index end = numext::mini(numext::maxi(loadOffset + n - packetStart, Index(0)), PacketSize); + if (end == start) return pzero(PacketType()); + eigen_assert(check_array_bounds(actualRow + start, actualCol + start, end - start) && "Array index out of bounds"); + return m_argImpl.template partialPacket(actualRow, actualCol, end - start, start); + } + + template + EIGEN_STRONG_INLINE PacketType srcPartialPacket(Index index, Index offset, Index n, Index loadOffset) const { + constexpr Index PacketSize = unpacket_traits::size; + Index packetStart = offset * PacketSize; + Index actualIndex = index + packetStart; + Index start = numext::mini(numext::maxi(loadOffset - packetStart, Index(0)), PacketSize); + Index end = numext::mini(numext::maxi(loadOffset + n - packetStart, Index(0)), PacketSize); + if (end == start) return pzero(PacketType()); + eigen_assert(check_array_bounds(actualIndex + start, end - start) && "Array index out of bounds"); + return m_argImpl.template partialPacket(actualIndex, end - start, start); + } + // There is no source packet type with equal or fewer elements than DstPacketType. // This is problematic as the evaluation loop may attempt to access data outside the bounds of the array. // For example, consider the cast utilizing pcast with an array of size 4: {0.0f,1.0f,2.0f,3.0f}. // The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which // is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array. - // Instead, perform runtime check to determine if the load would access data outside the bounds of the array. - // If not, perform full load. Otherwise, revert to a scalar loop to perform a partial load. - // In either case, perform a vectorized cast of the source packet. template = true> EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const { constexpr int DstPacketSize = unpacket_traits::size; constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); - SrcPacketType src; - if (EIGEN_PREDICT_TRUE(check_array_bounds(row, col, SrcPacketSize))) { - src = srcPacket(row, col, 0); - } else { - Array srcArray; - for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(row, col, k); - for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0); - src = pload(srcArray.data()); - } - return pcast(src); + return pcast(srcPartialPacket(row, col, 0, DstPacketSize, 0)); } // Use the source packet type with the same size as DstPacketType, if it exists template = true> @@ -701,22 +782,52 @@ struct unary_evaluator, ArgType>, In srcPacket(row, col, 6), srcPacket(row, col, 7)); } - // Analogous routines for linear access. 
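+  // Partial-packet analogues of the packet() cast routines above: the caller's (n, offset) range is forwarded
+  // to srcPartialPacket, which clamps each constituent source load to the valid elements and zero-fills the
+  // remaining lanes. For example, casting the float array {0.0f,1.0f,2.0f,3.0f} to double, the second Packet2d
+  // iteration now partially loads just {2.0f,3.0f} instead of reading past the end of the array.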
+ template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index row, Index col, Index n, Index offset) const { + constexpr int DstPacketSize = unpacket_traits::size; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast(srcPartialPacket(row, col, 0, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index row, Index col, Index n, Index offset) const { + constexpr int DstPacketSize = unpacket_traits::size; + using SizedSrcPacketType = typename find_packet_by_size::type; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast( + srcPartialPacket(row, col, 0, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index row, Index col, Index n, Index offset) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + return pcast(srcPartialPacket(row, col, 0, n, offset), + srcPartialPacket(row, col, 1, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index row, Index col, Index n, Index offset) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + return pcast( + srcPartialPacket(row, col, 0, n, offset), srcPartialPacket(row, col, 1, n, offset), + srcPartialPacket(row, col, 2, n, offset), srcPartialPacket(row, col, 3, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index row, Index col, Index n, Index offset) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + return pcast( + srcPartialPacket(row, col, 0, n, offset), srcPartialPacket(row, col, 1, n, offset), + srcPartialPacket(row, col, 2, n, offset), srcPartialPacket(row, col, 3, n, offset), + srcPartialPacket(row, col, 4, n, offset), srcPartialPacket(row, col, 5, n, offset), + srcPartialPacket(row, col, 6, n, offset), srcPartialPacket(row, col, 7, n, offset)); + } + + // Analogous routines for linear access.
template = true> EIGEN_STRONG_INLINE DstPacketType packet(Index index) const { constexpr int DstPacketSize = unpacket_traits::size; constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); - SrcPacketType src; - if (EIGEN_PREDICT_TRUE(check_array_bounds(index, SrcPacketSize))) { - src = srcPacket(index, 0); - } else { - Array srcArray; - for (size_t k = 0; k < DstPacketSize; k++) srcArray[k] = srcCoeff(index, k); - for (size_t k = DstPacketSize; k < SrcPacketSize; k++) srcArray[k] = SrcType(0); - src = pload(srcArray.data()); - } - return pcast(src); + return pcast(srcPartialPacket(index, 0, DstPacketSize, 0)); } template = true> EIGEN_STRONG_INLINE DstPacketType packet(Index index) const { @@ -746,6 +857,45 @@ struct unary_evaluator, ArgType>, In srcPacket(index, 6), srcPacket(index, 7)); } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index index, Index n, Index offset) const { + constexpr int DstPacketSize = unpacket_traits::size; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast(srcPartialPacket(index, 0, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index index, Index n, Index offset) const { + constexpr int DstPacketSize = unpacket_traits::size; + using SizedSrcPacketType = typename find_packet_by_size::type; + constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType); + constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode); + return pcast( + srcPartialPacket(index, 0, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index index, Index n, Index offset) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + return pcast(srcPartialPacket(index, 0, n, offset), + srcPartialPacket(index, 1, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index index, Index n, Index offset) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + return pcast( + srcPartialPacket(index, 0, n, offset), srcPartialPacket(index, 1, n, offset), + srcPartialPacket(index, 2, n, offset), srcPartialPacket(index, 3, n, offset)); + } + template = true> + EIGEN_STRONG_INLINE DstPacketType partialPacket(Index index, Index n, Index offset) const { + constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode); + return pcast( + srcPartialPacket(index, 0, n, offset), srcPartialPacket(index, 1, n, offset), + srcPartialPacket(index, 2, n, offset), srcPartialPacket(index, 3, n, offset), + srcPartialPacket(index, 4, n, offset), srcPartialPacket(index, 5, n, offset), + srcPartialPacket(index, 6, n, offset), srcPartialPacket(index, 7, n, offset)); + } + constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; } constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; } @@ -823,6 +973,20 @@ struct ternary_evaluator, IndexBased m_d.arg3Impl.template packet(index)); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return m_d.func().packetOp(m_d.arg1Impl.template partialPacket(row, col, n, offset), + m_d.arg2Impl.template partialPacket(row, col, n, offset), + m_d.arg3Impl.template partialPacket(row, col, n, 
offset)); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return m_d.func().packetOp(m_d.arg1Impl.template partialPacket(index, n, offset), + m_d.arg2Impl.template partialPacket(index, n, offset), + m_d.arg3Impl.template partialPacket(index, n, offset)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -919,6 +1083,18 @@ struct binary_evaluator, IndexBased, IndexBase m_d.rhsImpl.template packet(index)); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return m_d.func().packetOp(m_d.lhsImpl.template partialPacket(row, col, n, offset), + m_d.rhsImpl.template partialPacket(row, col, n, offset)); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return m_d.func().packetOp(m_d.lhsImpl.template partialPacket(index, n, offset), + m_d.rhsImpl.template partialPacket(index, n, offset)); + } + protected: // this helper permits to completely eliminate the functor if it is empty struct Data { @@ -1051,6 +1227,28 @@ struct mapbase_evaluator : evaluator_base { internal::pstoret(m_data + index * m_innerStride.value(), x); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + return internal::ploadt_partial(ptr, n, offset); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return internal::ploadt_partial(m_data + index * m_innerStride.value(), n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index row, Index col, const PacketType& x, Index n, Index offset) { + PointerType ptr = m_data + row * rowStride() + col * colStride(); + return internal::pstoret_partial(ptr, x, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, Index n, Index offset) { + internal::pstoret_partial(m_data + index * m_innerStride.value(), x, n, offset); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EIGEN_CONSTEXPR Index rowStride() const EIGEN_NOEXCEPT { return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value(); @@ -1243,6 +1441,37 @@ struct unary_evaluator, IndexBa x); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return m_argImpl.template partialPacket(m_startRow.value() + row, m_startCol.value() + col, n, + offset); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + if (ForwardLinearAccess) + return m_argImpl.template partialPacket(m_linear_offset.value() + index, n, offset); + else + return partialPacket(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0, + n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index row, Index col, const PacketType& x, Index n, Index offset) { + return m_argImpl.template writePartialPacket(m_startRow.value() + row, + m_startCol.value() + col, x, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, Index n, Index offset) { + if (ForwardLinearAccess) + return m_argImpl.template writePartialPacket(m_linear_offset.value() + index, x, n, + offset); + else + return writePartialPacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? 
index : 0, x, n, offset); + } + protected: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const { @@ -1398,6 +1627,27 @@ struct unary_evaluator> return m_argImpl.template packet(actual_index); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + const Index actual_row = internal::traits::RowsAtCompileTime == 1 ? 0 + : RowFactor == 1 ? row + : row % m_rows.value(); + const Index actual_col = internal::traits::ColsAtCompileTime == 1 ? 0 + : ColFactor == 1 ? col + : col % m_cols.value(); + + return m_argImpl.template partialPacket(actual_row, actual_col, n, offset); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + const Index actual_index = internal::traits::RowsAtCompileTime == 1 + ? (ColFactor == 1 ? index : index % m_cols.value()) + : (RowFactor == 1 ? index : index % m_rows.value()); + + return m_argImpl.template partialPacket(actual_index, n, offset); + } + protected: const ArgTypeNested m_arg; evaluator m_argImpl; @@ -1454,6 +1704,26 @@ struct evaluator_wrapper_base : evaluator_base { m_argImpl.template writePacket(index, x); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return m_argImpl.template partialPacket(row, col, n, offset); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + return m_argImpl.template partialPacket(index, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index row, Index col, const PacketType& x, Index n, Index offset) { + m_argImpl.template writePartialPacket(row, col, x, n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, Index n, Index offset) { + m_argImpl.template writePartialPacket(index, x, n, offset); + } + protected: evaluator m_argImpl; }; @@ -1480,6 +1750,9 @@ struct unary_evaluator> : evaluator_wrapper_base struct reverse_packet_cond; +template +struct reverse_packetpartial_cond; + template struct unary_evaluator> : evaluator_base> { typedef Reverse XprType; @@ -1570,6 +1843,70 @@ struct unary_evaluator> : evaluator_base(m_rows.value() * m_cols.value() - index - PacketSize, preverse(x)); } + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + // using impl = reverse_packet_cond; + // static constexpr int PacketSize = unpacket_traits::size; + // static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + // static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + + // Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + // Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + // Index actualOffset = ReversePacket ? (PacketSize - n - offset) : offset; + + // return impl::run(m_argImpl.template partialPacket(actualRow, actualCol, n, actualOffset)); + + Index OffsetRow = ReverseRow && IsColMajor ? n : 1; + Index OffsetCol = ReverseCol && IsRowMajor ? n : 1; + typedef internal::reverse_packetpartial_cond reverse_packet; + return reverse_packet::run(m_argImpl.template partialPacket( + ReverseRow ? m_rows.value() - row - OffsetRow : row, + ReverseCol ? 
m_cols.value() - col - OffsetCol : col, n, offset), + n); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index index, Index n, Index offset) const { + // static constexpr int PacketSize = unpacket_traits::size; + // Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + // Index actualOffset = PacketSize - n - offset; + + return preverse_partial( + m_argImpl.template partialPacket(m_rows.value() * m_cols.value() - index - n, n, offset), + n); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index row, Index col, const PacketType& x, Index n, Index offset) { + // using impl = reverse_packet_cond; + // static constexpr int PacketSize = unpacket_traits::size; + // static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1; + // static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1; + + // Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row; + // Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col; + // Index actualOffset = ReversePacket ? (PacketSize - n - offset) : offset; + + // m_argImpl.template writePartialPacket(actualRow, actualCol, impl::run(x), n, actualOffset); + Index OffsetRow = ReverseRow && IsColMajor ? n : 1; + Index OffsetCol = ReverseCol && IsRowMajor ? n : 1; + typedef internal::reverse_packetpartial_cond reverse_packet; + m_argImpl.template writePartialPacket(ReverseRow ? m_rows.value() - row - OffsetRow : row, + ReverseCol ? m_cols.value() - col - OffsetCol : col, + reverse_packet::run(x, n), n, offset); + } + + template + EIGEN_STRONG_INLINE void writePartialPacket(Index index, const PacketType& x, Index n, Index offset) { + // static constexpr int PacketSize = unpacket_traits::size; + + // Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize; + // Index actualOffset = PacketSize - n - offset; + + m_argImpl.template writePartialPacket(m_rows.value() * m_cols.value() - index - n, + preverse_partial(x, n), n, offset); + } + protected: evaluator m_argImpl; diff --git a/Eigen/src/Core/GenericPacketMath.h b/Eigen/src/Core/GenericPacketMath.h index 1d79b4ab8ff6556316b15af9d5ce1743844e5cae..ecb973272b0dfcc5ed6ab02bc9c13827a103badf 100644 --- a/Eigen/src/Core/GenericPacketMath.h +++ b/Eigen/src/Core/GenericPacketMath.h @@ -986,6 +986,20 @@ EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) { return a; } +/** \internal \returns the reversed partial first n elements of \a a*/ +template +EIGEN_DEVICE_FUNC inline Packet preverse_partial(const Packet& a, const Index n) { + const Index packet_size = unpacket_traits::size; + eigen_assert(n <= packet_size && "number of elements will reverse past end of packet"); + EIGEN_ALIGN_MAX Scalar elements[packet_size]; + pstore(elements, a); + EIGEN_USING_STD(swap); + for (Index i = n - 1; i >= n / 2; i--) { + swap(elements[i], elements[n - 1 - i]); + } + return pload(elements); +} + /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { diff --git a/Eigen/src/Core/PartialReduxEvaluator.h b/Eigen/src/Core/PartialReduxEvaluator.h index 7b2c8dca38f1ac941819e1d5f1b50ba91f63c4bf..e69a01d74016a856f804b6baa92aff88fc8b98aa 100644 --- a/Eigen/src/Core/PartialReduxEvaluator.h +++ b/Eigen/src/Core/PartialReduxEvaluator.h @@ -120,6 +120,36 @@ struct packetwise_redux_impl { } }; +/* Perform the actual reduction of partial packets */ +// TODO: implement unrolling +template +struct 
partial_packetwise_redux_impl { + typedef typename Evaluator::Scalar Scalar; + typedef typename redux_traits::PacketType PacketScalar; + + template + EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index n, Index offset) { + if (size == 0 || n == 0) return packetwise_redux_empty_value(func); + + const Index size4 = (size - 1) & (~3); + PacketType p = eval.template partialPacketByOuterInner(0, 0, n, offset); + Index i = 1; + // This loop is optimized for instruction pipelining: + // - each iteration generates two independent instructions + // - thanks to branch prediction and out-of-order execution we have independent instructions across loops + for (; i < size4; i += 4) + p = func.packetOp( + p, func.packetOp( + func.packetOp(eval.template partialPacketByOuterInner(i + 0, 0, n, offset), + eval.template partialPacketByOuterInner(i + 1, 0, n, offset)), + func.packetOp(eval.template partialPacketByOuterInner(i + 2, 0, n, offset), + eval.template partialPacketByOuterInner(i + 3, 0, n, offset)))); + for (; i < size; ++i) + p = func.packetOp(p, eval.template partialPacketByOuterInner(i, 0, n, offset)); + return p; + } +}; + template struct evaluator > : evaluator_base > { @@ -197,6 +227,35 @@ struct evaluator > return p; } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType partialPacket(Index i, Index j, Index n, Index offset) const { + return partialPacket(Direction == Vertical ? j : i, n, offset); + } + + template + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType partialPacket(Index idx, Index n, Index offset) const { + enum { PacketSize = internal::unpacket_traits::size }; + typedef Block + PanelType; + + PanelType panel(m_arg, Direction == Vertical ? 0 : idx, Direction == Vertical ? idx : 0, + Direction == Vertical ? m_arg.rows() : n, Direction == Vertical ? n : m_arg.cols()); + + // FIXME + // See bug 1612, currently if PacketSize==1 (i.e. complex with 128bits registers) then the storage-order of + // panel get reversed and methods like packetByOuterInner do not make sense anymore in this context. So let's just + // by pass "vectorization" in this case: + if (PacketSize == 1) return internal::pset1(coeff(idx)); + + typedef typename internal::redux_evaluator PanelEvaluator; + PanelEvaluator panel_eval(panel); + typedef typename MemberOp::BinaryOp BinaryOp; + PacketType p = internal::partial_packetwise_redux_impl::template run( + panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), n, offset); + return p; + } + protected: ConstArgTypeNested m_arg; const MemberOp m_functor; diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h index 77a658a8ef953cd4c01fce86b771ba0b0fca87d6..a79ddd304bbaa023d0dfd670847adf7a070ed144 100644 --- a/Eigen/src/Core/ProductEvaluators.h +++ b/Eigen/src/Core/ProductEvaluators.h @@ -631,6 +631,24 @@ struct product_evaluator, ProductTag, DenseShape, return packet(row, col); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType partialPacket(Index row, Index col, Index n, + Index offset) const { + PacketType res; + typedef etor_product_packet_impl + PacketImpl; + PacketImpl::run_partial(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, n, offset); + return res; + } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType partialPacket(Index index, Index n, Index offset) const { + const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index; + const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 
index : 0; + return partialPacket(row, col, n, offset); + } + protected: add_const_on_value_type_t m_lhs; add_const_on_value_type_t m_rhs; @@ -666,6 +684,13 @@ struct etor_product_packet_impl(lhs.coeff(row, Index(UnrollingIndex - 1))), rhs.template packet(Index(UnrollingIndex - 1), col), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index n, Index offset) { + etor_product_packet_impl::run_partial( + row, col, lhs, rhs, innerDim, res, n, offset); + res = pmadd(pset1(lhs.coeff(row, Index(UnrollingIndex - 1))), + rhs.template partialPacket(Index(UnrollingIndex - 1), col, n, offset), res); + } }; template @@ -677,6 +702,13 @@ struct etor_product_packet_impl(row, Index(UnrollingIndex - 1)), pset1(rhs.coeff(Index(UnrollingIndex - 1), col)), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index n, Index offset) { + etor_product_packet_impl::run_partial( + row, col, lhs, rhs, innerDim, res, n, offset); + res = pmadd(lhs.template partialPacket(row, Index(UnrollingIndex - 1), n, offset), + pset1(rhs.coeff(Index(UnrollingIndex - 1), col)), res); + } }; template @@ -685,6 +717,12 @@ struct etor_product_packet_impl { Index /*innerDim*/, Packet& res) { res = pmul(pset1(lhs.coeff(row, Index(0))), rhs.template packet(Index(0), col)); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index /*innerDim*/, Packet& res, Index n, + Index offset) { + res = pmul(pset1(lhs.coeff(row, Index(0))), + rhs.template partialPacket(Index(0), col, n, offset)); + } }; template @@ -693,13 +731,24 @@ struct etor_product_packet_impl { Index /*innerDim*/, Packet& res) { res = pmul(lhs.template packet(row, Index(0)), pset1(rhs.coeff(Index(0), col))); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index /*innerDim*/, Packet& res, Index n, + Index offset) { + res = pmul(lhs.template partialPacket(row, Index(0), n, offset), + pset1(rhs.coeff(Index(0), col))); + } }; template struct etor_product_packet_impl { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) { - res = pset1(typename unpacket_traits::type(0)); + res = pzero(res); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, + const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res, + Index /*n*/, Index /*offset*/) { + res = pzero(res); } }; @@ -707,7 +756,12 @@ template struct etor_product_packet_impl { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) { - res = pset1(typename unpacket_traits::type(0)); + res = pzero(res); + } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, + const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res, + Index /*n*/, Index /*offset*/) { + res = pzero(res); } }; @@ -715,20 +769,34 @@ template struct etor_product_packet_impl { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { - res = pset1(typename unpacket_traits::type(0)); + res = pzero(res); for (Index i = 0; i < 
innerDim; ++i) res = pmadd(pset1(lhs.coeff(row, i)), rhs.template packet(i, col), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index n, Index offset) { + res = pzero(res); + for (Index i = 0; i < innerDim; ++i) + res = + pmadd(pset1(lhs.coeff(row, i)), rhs.template partialPacket(i, col, n, offset), res); + } }; template struct etor_product_packet_impl { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Index innerDim, Packet& res) { - res = pset1(typename unpacket_traits::type(0)); + res = pzero(res); for (Index i = 0; i < innerDim; ++i) res = pmadd(lhs.template packet(row, i), pset1(rhs.coeff(i, col)), res); } + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_partial(Index row, Index col, const Lhs& lhs, const Rhs& rhs, + Index innerDim, Packet& res, Index n, Index offset) { + res = pzero(res); + for (Index i = 0; i < innerDim; ++i) + res = + pmadd(lhs.template partialPacket(row, i, n, offset), pset1(rhs.coeff(i, col)), res); + } }; /*************************************************************************** @@ -867,6 +935,26 @@ struct diagonal_product_evaluator_base : evaluator_base { m_diagImpl.template packet(id)); } + template + EIGEN_STRONG_INLINE PacketType partial_packet_impl(Index row, Index col, Index id, Index n, Index offset, + internal::true_type) const { + return internal::pmul(m_matImpl.template partialPacket(row, col, n, offset), + internal::pset1(m_diagImpl.coeff(id))); + } + + template + EIGEN_STRONG_INLINE PacketType partial_packet_impl(Index row, Index col, Index id, Index n, Index offset, + internal::false_type) const { + enum { + InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime, + DiagonalPacketLoadMode = plain_enum_min( + LoadMode, + ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator::Alignment)) // FIXME hardcoded 16!! + }; + return internal::pmul(m_matImpl.template partialPacket(row, col, n, offset), + m_diagImpl.template partialPacket(id, n, offset)); + } + evaluator m_diagImpl; evaluator m_matImpl; }; @@ -910,6 +998,19 @@ struct product_evaluator, ProductTag, DiagonalSha return packet(int(StorageOrder) == ColMajor ? idx : 0, int(StorageOrder) == ColMajor ? 0 : idx); } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return this->template partial_packet_impl( + row, col, row, n, offset, + std::conditional_t()); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index idx, Index n, Index offset) const { + return partialPacket(int(StorageOrder) == ColMajor ? idx : 0, + int(StorageOrder) == ColMajor ? 0 : idx, n, offset); + } #endif }; @@ -949,6 +1050,19 @@ struct product_evaluator, ProductTag, DenseShape, return packet(int(StorageOrder) == ColMajor ? idx : 0, int(StorageOrder) == ColMajor ? 0 : idx); } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index row, Index col, Index n, Index offset) const { + return this->template partial_packet_impl( + row, col, col, n, offset, + std::conditional_t()); + } + + template + EIGEN_STRONG_INLINE PacketType partialPacket(Index idx, Index n, Index offset) const { + return partialPacket(int(StorageOrder) == ColMajor ? idx : 0, + int(StorageOrder) == ColMajor ? 
0 : idx, n, offset); + } #endif }; diff --git a/Eigen/src/Core/Redux.h b/Eigen/src/Core/Redux.h index 0c5f2d9f6b6859bb8cfa4c23e8ee96f3fb957df9..123ac4ff611d4f9b7221b6802d4901b6bd796aed 100644 --- a/Eigen/src/Core/Redux.h +++ b/Eigen/src/Core/Redux.h @@ -414,6 +414,13 @@ class redux_evaluator : public internal::evaluator { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const { return Base::template packet(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType partialPacketByOuterInner(Index outer, Index inner, Index n, + Index offset) const { + return Base::template partialPacket(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer, n, + offset); + } }; } // end namespace internal diff --git a/Eigen/src/Core/Reverse.h b/Eigen/src/Core/Reverse.h index 66116aa4ee34367ecde0b944a49a8d417bf19b4e..2a5877fe6520a713956ce4d05572cceee7c6dd41 100644 --- a/Eigen/src/Core/Reverse.h +++ b/Eigen/src/Core/Reverse.h @@ -40,11 +40,24 @@ struct reverse_packet_cond { static inline PacketType run(const PacketType& x) { return preverse(x); } }; +template +struct reverse_packetpartial_cond { + static inline PacketType run(const PacketType& x, const Index n) { return preverse_partial(x, n); } +}; + template struct reverse_packet_cond { static inline PacketType run(const PacketType& x) { return x; } }; +template +struct reverse_packetpartial_cond { + static inline PacketType run(const PacketType& x, const Index n) { + EIGEN_UNUSED_VARIABLE(n); + return x; + } +}; + } // end namespace internal /** \class Reverse diff --git a/Eigen/src/Core/Swap.h b/Eigen/src/Core/Swap.h index d417c1ad1ee7c5128f6ac03d286c3e17dbf2b7ad..2cba12c2e54a4c9c875f6d506d44eb1ec75f29cb 100644 --- a/Eigen/src/Core/Swap.h +++ b/Eigen/src/Core/Swap.h @@ -65,6 +65,31 @@ class generic_dense_assignment_kernel(row, col); } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(Index row, Index col, Index n, Index offset) { + PacketType tmp = m_src.template partialPacket(row, col, n, offset); + const_cast(m_src).template writePartialPacket( + row, col, m_dst.template partialPacket(row, col, n, offset), n, offset); + m_dst.template writePartialPacket(row, col, tmp, n, offset); + } + + template + EIGEN_STRONG_INLINE void assignPartialPacket(Index index, Index n, Index offset) { + PacketType tmp = m_src.template partialPacket(index, n, offset); + const_cast(m_src).template writePartialPacket( + index, m_dst.template partialPacket(index, n, offset), n, offset); + m_dst.template writePartialPacket(index, tmp, n, offset); + } + + // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I + // mean no CRTP (Gael) + template + EIGEN_STRONG_INLINE void assignPartialPacketByOuterInner(Index outer, Index inner, Index n, Index offset) { + Index row = Base::rowIndexByOuterInner(outer, inner); + Index col = Base::colIndexByOuterInner(outer, inner); + assignPartialPacket(row, col, n, offset); + } }; } // namespace internal diff --git a/Eigen/src/Core/arch/HVX/PacketMath.h b/Eigen/src/Core/arch/HVX/PacketMath.h index ccba96efd7726efeedeb466231a3dc08be132970..380e7740c4dea1fcea78a84dcaa424b09a7509e3 100644 --- a/Eigen/src/Core/arch/HVX/PacketMath.h +++ b/Eigen/src/Core/arch/HVX/PacketMath.h @@ -15,6 +15,9 @@ #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif +#ifndef EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD +#define EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD +#endif namespace Eigen { namespace 
internal { @@ -49,6 +52,66 @@ EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) { return v; } +// This function stores the first n bytes from vector vin to address 'mem'. +// n must be in range 1..128 and mem may have any alignment. +template +EIGEN_STRONG_INLINE void vstu_variable(T* mem, const size_t& n, HVX_Vector vin) { + // Rotate as needed. + uintptr_t mem_addr = reinterpret_cast(mem); + + vin = Q6_V_vlalign_VVR(vin, vin, mem_addr); + + size_t left_off = mem_addr & (__HVX_LENGTH__ - 1); + size_t right_off = left_off + n; + + HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr); + HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off); + + EIGEN_IF_CONSTEXPR(n > Alignment) { + if (right_off > __HVX_LENGTH__) { + Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), vin); + qr = Q6_Q_vcmp_eq_VbVb(vin, vin); + } + } + + ql_not = Q6_Q_or_QQn(ql_not, qr); + Q6_vmem_QnRIV(ql_not, mem, vin); +} + +// This function loads the first n bytes at address 'mem' into a vector. +// n must be in range 1..128 and mem may have any alignment. +// All elements after the last loaded element are initialized to zero. +template +EIGEN_STRONG_INLINE HVX_Vector vload_variable(const T* mem, const size_t& n, const size_t& offset) { +#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD) + // Fast partial vector load through aligned vmem load. + // The load may read past the end of the array, but it is aligned so that no memory fault occurs. + HVX_Vector v0 = HVX_vmem<0>(mem); + HVX_Vector v1 = v0; + uintptr_t mem_addr = reinterpret_cast(mem); + EIGEN_IF_CONSTEXPR(n <= Alignment) { + // A size no greater than the alignment never crosses multiple aligned vectors. + v1 = v0; + } + else { + uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1); + if (left_off + n > __HVX_LENGTH__) { + v1 = HVX_vmem<1>(mem); + } else { + v1 = v0; + } + } + v0 = Q6_V_valign_VVR(v1, v0, mem_addr); + v0 = Q6_V_valign_VVR(v0, Q6_V_vzero(), n); + return Q6_V_valign_VVR(Q6_V_vzero(), v0, __HVX_LENGTH__ - n - offset * sizeof(T)); +#else + HVX_Vector v; + memcpy(&v, mem, n); + v = Q6_V_valign_VVR(v, Q6_V_vzero(), n); + return Q6_V_valign_VVR(Q6_V_vzero(), v, __HVX_LENGTH__ - n - offset * sizeof(T)); +#endif +} + template EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) { #if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD) @@ -305,6 +368,16 @@ EIGEN_STRONG_INLINE Packet8f ploadu(const float* from) { return Packet8f::Create(HVX_load_partial::size, 0>(from)); } +template <> +EIGEN_STRONG_INLINE Packet32f pload_partial(const float* from, const Index n, const Index offset) { + return Packet32f::Create(vload_variable::alignment>(from, n * sizeof(float), offset)); +} + +template <> +EIGEN_STRONG_INLINE Packet32f ploadu_partial(const float* from, const Index n, const Index offset) { + return Packet32f::Create(vload_variable<0>(from, n * sizeof(float), offset)); +} + template <> EIGEN_STRONG_INLINE void pstore(float* to, const Packet32f& from) { HVX_store(to, from.Get()); @@ -331,6 +404,36 @@ EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet8f& from) { HVX_store_partial::size, 0>(to, from.Get()); } +template <> +EIGEN_STRONG_INLINE void pstore_partial(float* to, const Packet32f& from, const Index n, const Index offset) { + if (offset != 0) { + EIGEN_USING_STD(memcpy); + constexpr Index PacketSize = unpacket_traits::size; + alignas(alignof(Packet32f)) float elements[PacketSize]; + pstore(elements, from); + for (Index i = 0; i < numext::mini(n, PacketSize - offset); i++) { + to[i] = elements[i + offset]; + } + } else { + vstu_variable::alignment, float>(to, sizeof(float) * n,
from.Get()); + } +} + +template <> +EIGEN_STRONG_INLINE void pstoreu_partial(float* to, const Packet32f& from, const Index n, const Index offset) { + if (offset != 0) { + EIGEN_USING_STD(memcpy); + constexpr Index PacketSize = unpacket_traits::size; + alignas(alignof(Packet32f)) float elements[PacketSize]; + pstore(elements, from); + for (Index i = 0; i < numext::mini(n, PacketSize - offset); i++) { + to[i] = elements[i + offset]; + } + } else { + vstu_variable<0, float>(to, sizeof(float) * n, from.Get()); + } +} + template EIGEN_STRONG_INLINE HVXPacket pmul_hvx(const HVXPacket& a, const HVXPacket& b) { return HVXPacket::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get()))); @@ -777,10 +880,12 @@ EIGEN_STRONG_INLINE HVXPacket ploaddup_hvx(const float* from) { HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4); return HVXPacket::Create(HEXAGON_HVX_GET_V0(dup)); } + template <> EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) { return ploaddup_hvx(from); } + template <> EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { return ploaddup_hvx(from); @@ -829,6 +934,12 @@ EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) { return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta)); } +template <> +EIGEN_STRONG_INLINE Packet32f preverse_partial(const Packet32f& a, const Index n) { + Packet32f reverse_a = preverse(a); + return Packet32f::Create(Q6_V_vlalign_VVR(reverse_a.Get(), reverse_a.Get(), n * sizeof(float))); +} + template EIGEN_STRONG_INLINE HVXPacket pmin_hvx(const HVXPacket& a, const HVXPacket& b) { return HVXPacket::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get())); diff --git a/Eigen/src/Core/functors/AssignmentFunctors.h b/Eigen/src/Core/functors/AssignmentFunctors.h index 09d1da8ca2bcb41384520f46e2b793ba8b28a798..7d7a9984b316e93beed32ef057cd27fadce753f0 100644 --- a/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/Eigen/src/Core/functors/AssignmentFunctors.h @@ -29,6 +29,10 @@ struct assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, b); } + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, Index n, Index offset) const { + internal::pstoret_partial(a, b, n, offset); + } }; // Empty overload for void type (used by PermutationMatrix) @@ -56,6 +60,11 @@ struct add_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::padd(internal::ploadt(a), b)); } + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, Index n, Index offset) const { + internal::pstoret_partial( + a, internal::padd(internal::ploadt_partial(a, n, offset), b), n, offset); + } }; template struct functor_traits > { @@ -77,6 +86,11 @@ struct sub_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::psub(internal::ploadt(a), b)); } + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, Index n, Index offset) const { + internal::pstoret_partial( + a, internal::psub(internal::ploadt_partial(a, n, offset), b), n, offset); + } }; template struct functor_traits > { @@ -98,6 +112,11 @@ struct mul_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::pmul(internal::ploadt(a), b)); } + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, Index n, Index offset) const { + internal::pstoret_partial( + a, 
internal::pmul(internal::ploadt_partial(a, n, offset), b), n, offset); + } }; template struct functor_traits > { @@ -119,6 +138,11 @@ struct div_assign_op { EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const { internal::pstoret(a, internal::pdiv(internal::ploadt(a), b)); } + template + EIGEN_STRONG_INLINE void assignPartialPacket(DstScalar* a, const Packet& b, Index n, Index offset) const { + internal::pstoret_partial( + a, internal::pdiv(internal::ploadt_partial(a, n, offset), b), n, offset); + } }; template struct functor_traits > { diff --git a/Eigen/src/Core/functors/NullaryFunctors.h b/Eigen/src/Core/functors/NullaryFunctors.h index c53bb9073b21b6ddb2a2b676d86ae9d75a4afdcc..3bc9db0c4284fe36140f85b91a305f6787299c50 100644 --- a/Eigen/src/Core/functors/NullaryFunctors.h +++ b/Eigen/src/Core/functors/NullaryFunctors.h @@ -77,15 +77,13 @@ struct linspaced_op_impl { // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) if (m_flip) { Packet pi = plset(Scalar(i - m_size1)); - Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != 0)) return res; - Packet mask = pcmp_lt(pset1(0), plset(0)); + Packet res = pmadd(pset1(m_step), pi, pset1(m_high)); + Packet mask = pcmp_lt(pzero(res), plset(Scalar(i))); return pselect(mask, res, pset1(m_low)); } else { Packet pi = plset(Scalar(i)); - Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); - if (EIGEN_PREDICT_TRUE(i != m_size1 - unpacket_traits::size + 1)) return res; - Packet mask = pcmp_lt(plset(0), pset1(unpacket_traits::size - 1)); + Packet res = pmadd(pset1(m_step), pi, pset1(m_low)); + Packet mask = pcmp_lt(pi, pset1(Scalar(m_size1))); return pselect(mask, res, pset1(m_high)); } } diff --git a/Eigen/src/Core/products/GeneralMatrixVector.h b/Eigen/src/Core/products/GeneralMatrixVector.h index ba72a8a4fbe2fdd87c01ba89b676790379b3cdf4..8e2f8c271b55601929e8fe473c23a0a93a184678 100644 --- a/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/Eigen/src/Core/products/GeneralMatrixVector.h @@ -146,6 +146,16 @@ general_matrix_vector_product(alpha); ResPacketQuarter palpha_quarter = pset1(alpha); +#if defined(EIGEN_VECTORIZE_PARTIAL) + EIGEN_UNUSED_VARIABLE(cj); + EIGEN_UNUSED_VARIABLE(pcj_half); + EIGEN_UNUSED_VARIABLE(pcj_quarter); + EIGEN_UNUSED_VARIABLE(n_half); + EIGEN_UNUSED_VARIABLE(n_quarter); + EIGEN_UNUSED_VARIABLE(palpha_half); + EIGEN_UNUSED_VARIABLE(palpha_quarter); +#endif // EIGEN_VECTORIZE_PARTIAL + for (Index j2 = 0; j2 < cols; j2 += block_cols) { Index jend = numext::mini(j2 + block_cols, cols); Index i = 0; @@ -230,6 +240,19 @@ general_matrix_vector_product(res + i + ResPacketSize * 0))); i += ResPacketSize; } +#if defined(EIGEN_VECTORIZE_PARTIAL) + if (i < rows) { + Index leftover = rows - i; + ResPacket c0 = pset1(ResScalar(0)); + for (Index j = j2; j < jend; j += 1) { + RhsPacket b0 = pset1(rhs(j, 0)); + c0 = pcj.pmadd(lhs.template loadPartial(i + 0, j, leftover, 0), b0, c0); + } + pstoreu_partial(res + i + ResPacketSize * 0, + pmadd(c0, palpha, ploadu_partial(res + i + ResPacketSize * 0, leftover, 0)), leftover, + 0); + } +#else if (HasHalf && i < n_half) { ResPacketHalf c0 = pset1(ResScalar(0)); for (Index j = j2; j < jend; j += 1) { @@ -255,6 +278,7 @@ general_matrix_vector_product(m_data + i); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPartial(Index i, Index n, Index offset = 0) const { + return ploadt_partial(m_data + i, n, offset); + } + template EIGEN_DEVICE_FUNC bool aligned(Index i) const { return 
(std::uintptr_t(m_data + i) % sizeof(Packet)) == 0; @@ -103,6 +108,11 @@ class BlasLinearMapper { return ploadt(m_data + i); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPartial(Index i, Index n, Index offset = 0) const { + return ploadt_partial(m_data + i, n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType& p) const { pstoret(m_data + i, p); @@ -218,6 +228,11 @@ class blas_data_mapper { return ploadt(&operator()(i, j)); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPartial(Index i, Index j, Index n, Index offset = 0) const { + return ploadt_partial(&operator()(i, j), n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType& p) const { pstoret(&operator()(i, j), p); @@ -339,6 +354,11 @@ class blas_data_mapper { return pgather(&operator()(i, j), m_incr.value()); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPartial(Index i, Index j, Index n, Index /*offset*/ = 0) const { + return pgather_partial(&operator()(i, j), m_incr.value(), n); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType& p) const { pscatter(&operator()(i, j), p, m_incr.value()); diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 47ddd4f8ae33405f2a6600dc33bd3d07a668e63f..b5bcf8cc2d46f811b59c0f8f99420a47b245a5b6 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -433,6 +433,7 @@ extern "C" { #define EIGEN_VECTORIZE #define EIGEN_VECTORIZE_HVX +#define EIGEN_VECTORIZE_PARTIAL #include #endif @@ -519,6 +520,8 @@ inline static const char *SimdInstructionSetsInUse(void) { return "S390X ZVECTOR"; #elif defined(EIGEN_VECTORIZE_MSA) return "MIPS MSA"; +#elif defined(EIGEN_VECTORIZE_HVX) + return "HEXAGON HVX"; #else return "None"; #endif diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index 226844f6456484f8197383f68ba5238661f85f16..a9706cf0cae727b1b37b3d853f98899ae17661f0 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -508,6 +508,13 @@ #define EIGEN_OS_SOLARIS 0 #endif +/// \internal EIGEN_OS_QURT set to 1 if the OS is Qualcomm DSP QURT +#if defined(__QDSP6_RTOS__) +#define EIGEN_OS_QURT 1 +#else +#define EIGEN_OS_QURT 0 +#endif + //------------------------------------------------------------------------------------------ // Detect GPU compilers and architectures //------------------------------------------------------------------------------------------ diff --git a/ci/hexagon.dockerfile b/ci/hexagon.dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..15144b3ba305ff585bcb466ffc42dbaf1cb39f83 --- /dev/null +++ b/ci/hexagon.dockerfile @@ -0,0 +1,58 @@ +# see URLs below to create QPM login, download QPM deb package, and sign agreements needed to build this image +# build image with your Qualcomm credentials via "docker build --build-arg QPM_USER=foo --build-arg QPM_PASS=bar -t eigen-hex -f hexagon.dockerfile ." 
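Before moving on to the build tooling, it helps to pin down the contract behind the pload*_partial / pstore*_partial primitives and the assignPartialPacket functor hooks used in the hunks above: only n scalars are touched in memory, and lane `offset` of the packet lines up with the first of them. The following scalar model is an illustrative sketch only — the array-backed "packet", the *_ref names, and the zero-fill of untouched lanes are stand-ins patterned on the generic fallbacks, not on any particular backend:

// Illustrative sketch, not part of the patch.
#include <algorithm>
#include <Eigen/Core>

// Store lanes [offset, offset + n) of 'packet' to memory; to[n...] is never written.
template <typename Scalar, int PacketSize>
void pstore_partial_ref(Scalar* to, const Scalar (&packet)[PacketSize],
                        Eigen::Index n, Eigen::Index offset) {
  const Eigen::Index m = std::min<Eigen::Index>(n, PacketSize - offset);
  for (Eigen::Index i = 0; i < m; ++i) to[i] = packet[i + offset];
}

// Fill lanes [offset, offset + n) of 'packet' from memory; from[n...] is never read.
// Untouched lanes are zeroed here for determinism; backends only promise the memory bounds.
template <typename Scalar, int PacketSize>
void pload_partial_ref(Scalar (&packet)[PacketSize], const Scalar* from,
                       Eigen::Index n, Eigen::Index offset) {
  for (Eigen::Index i = 0; i < PacketSize; ++i) packet[i] = Scalar(0);
  const Eigen::Index m = std::min<Eigen::Index>(n, PacketSize - offset);
  for (Eigen::Index i = 0; i < m; ++i) packet[i + offset] = from[i];
}

This bounded-access guarantee is what lets the assignment loops and the GEMV kernel above replace their scalar prologue/epilogue with a single masked packet operation per tail.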
+# run eigen test suite with hexagon simulator via "docker run --rm -it eigen-hex -- -j 32" + +FROM ubuntu:22.04 +# create a qualcomm account at https://myaccount.qualcomm.com/signup +ARG QPM_USER +ARG QPM_PASS + +# override default eigen url/branch for testing MR +ARG REPO_URL=https://gitlab.com/libeigen/eigen.git +ARG REPO_BRANCH=master + +# install qpm dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends sudo bc libicu70 libsasl2-2 libsqlite3-0 librtmp1 ca-certificates && \ + rm -rf /var/lib/apt/lists/* + +# install qpm +# download QPM from https://qpm.qualcomm.com/#/main/tools/details/QPM3 +ADD qpm3.deb / +RUN dpkg -i qpm3.deb + +# login to qpm and install hexagon sdk 6.x +# sign agreements at https://www.qualcomm.com/agreements +# hexagon installs to /local/mnt/workspace/Qualcomm/... +RUN mkdir -p /local/mnt/workspace +# hexagon installer uses /usr/bin/python +# hexagon installer needs unzip to unpack android ndk +RUN apt-get update && \ + apt-get install -y --no-install-recommends python-is-python3 unzip && \ + rm -rf /var/lib/apt/lists/* +RUN qpm-cli --login $QPM_USER $QPM_PASS && \ + qpm-cli --license-activate hexagonsdk6.x && \ + echo y | qpm-cli --install hexagonsdk6.x && \ + rm -rf /tmp/* + +# install hexagon-sim dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends libncurses5 libtinfo5 libatomic1 && \ + rm -rf /var/lib/apt/lists/* + +# install eigen dependencies +RUN apt-get update && \ + apt-get install -y --no-install-recommends git cmake make && \ + rm -rf /var/lib/apt/lists/* + +# clone repo, compile tests +SHELL ["/bin/bash", "-c"] +RUN git clone --filter=blob:none -b $REPO_BRANCH $REPO_URL /eigen && \ + mkdir /build && \ + cd /build &&\ + source /local/mnt/workspace/Qualcomm/Hexagon_SDK/6.*/setup_sdk_env.source && \ + cmake ../eigen -DCMAKE_TOOLCHAIN_FILE=../eigen/cmake/HexagonToolchain.cmake -DBUILD_TESTING=ON && \ + make -j 40 buildtests + +WORKDIR /build +ENTRYPOINT ["ctest"] \ No newline at end of file diff --git a/cmake/EigenTesting.cmake b/cmake/EigenTesting.cmake index a1488fc4762201d291fca37fd9e42c0f8f17189f..81d9cfe04777c4065e3e2604d8ed6cb3e34cd094 100644 --- a/cmake/EigenTesting.cmake +++ b/cmake/EigenTesting.cmake @@ -75,8 +75,12 @@ macro(ei_add_test_internal testname testname_with_suffix) # let the user pass flags. 
if(${ARGC} GREATER 2) - separate_arguments(compile_options NATIVE_COMMAND ${ARGV2}) - target_compile_options(${targetname} PRIVATE ${compile_options}) + if(HEXAGON) + target_compile_options(${targetname} PRIVATE "SHELL:${ARGV2}") + else() + separate_arguments(compile_options NATIVE_COMMAND ${ARGV2}) + target_compile_options(${targetname} PRIVATE ${compile_options}) + endif() endif() if(EIGEN_TEST_CUSTOM_CXX_FLAGS) @@ -107,7 +111,11 @@ macro(ei_add_test_internal testname testname_with_suffix) endif() endif() - add_test(${testname_with_suffix} "${targetname}") + if(HEXAGON) + add_test(NAME ${testname_with_suffix} COMMAND "${targetname}") + else() + add_test(${testname_with_suffix} "${targetname}") + endif() # Specify target and test labels according to EIGEN_CURRENT_SUBPROJECT get_property(current_subproject GLOBAL PROPERTY EIGEN_CURRENT_SUBPROJECT) diff --git a/cmake/HexagonToolchain.cmake b/cmake/HexagonToolchain.cmake new file mode 100644 index 0000000000000000000000000000000000000000..97fa476c0905a28a61dd06f91fd0e253d2ceb977 --- /dev/null +++ b/cmake/HexagonToolchain.cmake @@ -0,0 +1,153 @@ + +if(HEXAGON_TOOLCHAIN_INCLUDED) + return() +endif(HEXAGON_TOOLCHAIN_INCLUDED) +set(HEXAGON_TOOLCHAIN_INCLUDED true) + +set(EIGEN_TEST_HVX ON) + +if (NOT DSP_VERSION) + set(DSP_VERSION v69) +endif() + +set(TOOLS_VARIANT $ENV{DEFAULT_TOOLS_VARIANT}) +set(PREBUILT_LIB_DIR hexagon_${TOOLS_VARIANT}_${DSP_VERSION}) + +# Cross Compiling for Hexagon +set(HEXAGON TRUE) +set(CMAKE_SYSTEM_NAME QURT) +set(CMAKE_SYSTEM_PROCESSOR Hexagon) +set(CMAKE_SYSTEM_VERSION "1") #${HEXAGON_PLATFORM_LEVEL}) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) +set(CUSTOM_RUNELF_PATH "") + +# To fix backward compatibility with EAI addon. 
+if (NOT HEXAGON_SDK_ROOT) + set(HEXAGON_SDK_ROOT $ENV{HEXAGON_SDK_ROOT}) +endif() + +if (NOT HEXAGON_TOOLS_ROOT) + if (DEFINED ENV{HEXAGON_TOOLS_ROOT}) + set(HEXAGON_TOOLS_ROOT $ENV{HEXAGON_TOOLS_ROOT}) + endif() + if(NOT HEXAGON_TOOLS_ROOT) + set(HEXAGON_TOOLS_ROOT $ENV{DEFAULT_HEXAGON_TOOLS_ROOT}) + endif() +endif() + +file(TO_CMAKE_PATH "${HEXAGON_TOOLS_ROOT}" HEXAGON_TOOLS_ROOT) +file(TO_CMAKE_PATH "${HEXAGON_SDK_ROOT}" HEXAGON_SDK_ROOT) + +include(${HEXAGON_SDK_ROOT}/build/cmake/hexagon_arch.cmake) + +set(HEXAGON_TOOLCHAIN ${HEXAGON_TOOLS_ROOT}) +set(HEXAGON_LIB_DIR "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib") +set(HEXAGON_ISS_DIR ${HEXAGON_TOOLCHAIN}/Tools/lib/iss) +set(RUN_MAIN_HEXAGON "${HEXAGON_SDK_ROOT}/libs/run_main_on_hexagon/ship/${PREBUILT_LIB_DIR}/run_main_on_hexagon_sim") + +set(CMAKE_TRY_COMPILE_PLATFORM_VARIABLES + HEXAGON_SDK_ROOT + HEXAGON_TOOLS_ROOT +) + +#QURT SPECIFIC LIBS and Includes +# Linker Flags +# QURT Related includes and linker flags + +set(V_ARCH ${HEXAGON_ARCH}) +set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/ADSP${V_ARCH}MP${V_ARCH_EXTN}") +set(_QURT_INSTALL_DIR "${HEXAGON_SDK_ROOT}/rtos/qurt/compute${V_ARCH}${V_ARCH_EXTN}") + +message(DEBUG "_QURT_INSTALL_DIR:${_QURT_INSTALL_DIR}") +set(RTOS_DIR ${_QURT_INSTALL_DIR}) +set(TARGET_DIR "${HEXAGON_LIB_DIR}/${V_ARCH}/G0") +include_directories( + ${_QURT_INSTALL_DIR}/include + ${_QURT_INSTALL_DIR}/include/qurt + ${_QURT_INSTALL_DIR}/include/posix + ) + +# Non QURT related includes and linker flags +set(TARGET_DIR_NOOS "${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/${HEXAGON_ARCH}") + +set(EXE_LD_FLAGS + -m${V_ARCH} + -G0 + -fpic + -Wl,-Bsymbolic + -Wl,-L${TARGET_DIR_NOOS}/G0/pic + -Wl,-L${HEXAGON_TOOLCHAIN}/Tools/target/hexagon/lib/ + -Wl,--no-threads -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign + -shared + "-o " + "" + -Wl,--start-group + "" + "" + -Wl,${TARGET_DIR_NOOS}/G0/pic/libc++.a + -Wl,${TARGET_DIR_NOOS}/G0/pic/libc++abi.a + -Wl,--end-group + -lc + ) + STRING(REPLACE ";" " " EXE_LD_FLAGS "${EXE_LD_FLAGS}") + +set(HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS "${EXE_LD_FLAGS}" ) +message(DEBUG "Hexagon C Executable Linker Line:${HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS}") +set(HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS "${EXE_LD_FLAGS}") +message(DEBUG "Hexagon CXX Executable Linker Line:${HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS}") + +# System include paths +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/incs/stddef) +include_directories(SYSTEM ${HEXAGON_SDK_ROOT}/ipc/fastrpc/incs) + +# LLVM toolchain setup +# Compiler paths, options and architecture +set(CMAKE_C_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_CXX_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_AR ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-ar${HEXAGON_TOOLCHAIN_SUFFIX}) +set(CMAKE_ASM_COMPILER ${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-clang++${HEXAGON_TOOLCHAIN_SUFFIX}) +set(HEXAGON_LINKER ${CMAKE_C_COMPILER}) +set(CMAKE_PREFIX_PATH ${HEXAGON_TOOLCHAIN}/Tools/target/hexagon) +set(HEXAGON_SIM "${HEXAGON_TOOLCHAIN}/Tools/bin/hexagon-sim${HEXAGON_TOOLCHAIN_SUFFIX}") +set(DEBUG_FLAGS "-O0 -g") +set(RELEASE_FLAGS "-O2") +set(COMMON_FLAGS "-m${HEXAGON_ARCH} -G0 -Wall -fno-zero-initialized-in-bss -fdata-sections -fpic") + +set(COMMON_FLAGS "${COMMON_FLAGS} -mhvx -mhvx-length=128B") + +set(CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Wl,-soname,") 
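The -mhvx -mhvx-length=128B flags appended to COMMON_FLAGS just above are what make __HVX__ and __HVX_LENGTH__ visible to Eigen's vectorization config, which in turn defines EIGEN_VECTORIZE_HVX and EIGEN_VECTORIZE_PARTIAL. A small self-check that a test binary was really built in the expected mode might look like this (illustrative, not part of the patch):

// Illustrative sketch, not part of the patch.
#if !defined(__HVX__) || (__HVX_LENGTH__ != 128)
#error "HVX 128-byte mode not enabled; check COMMON_FLAGS in HexagonToolchain.cmake"
#endif

#include <cstdio>
#include <Eigen/Core>

int main() {
  // Expected to print "HEXAGON HVX", per the SimdInstructionSetsInUse() hunk earlier.
  std::printf("%s\n", Eigen::SimdInstructionSetsInUse());
  return 0;
}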
+set(CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Wl,-soname,") + +set(CMAKE_CXX_FLAGS_DEBUG "${COMMON_FLAGS} ${DEBUG_FLAGS}") +set(CMAKE_CXX_FLAGS_RELEASE "${COMMON_FLAGS} ${RELEASE_FLAGS}") +set(CMAKE_C_FLAGS_DEBUG "${COMMON_FLAGS} ${DEBUG_FLAGS}") +set(CMAKE_C_FLAGS_RELEASE "${COMMON_FLAGS} ${RELEASE_FLAGS}") +if(ADD_SYMBOLS) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -g ") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -g ") +endif() +set(CMAKE_ASM_FLAGS_DEBUG "${CMAKE_ASM_FLAGS_DEBUG} ${CMAKE_CXX_FLAGS_DEBUG}") +set(CMAKE_ASM_FLAGS_RELEASE "${CMAKE_ASM_FLAGS_RELEASE} ${CMAKE_CXX_FLAGS_RELEASE}") + +# Linker Options +set(CMAKE_C_LINK_EXECUTABLE "${HEXAGON_LINKER} ${HEXAGON_C_LINK_EXECUTABLE_LINK_OPTIONS}") +set(CMAKE_C_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") +set(CMAKE_CXX_LINK_EXECUTABLE "${HEXAGON_LINKER} ${HEXAGON_CXX_LINK_EXECUTABLE_LINK_OPTIONS}") +set(CMAKE_CXX_CREATE_SHARED_LIBRARY "${HEXAGON_LINKER} ${HEXAGON_PIC_SHARED_LINK_OPTIONS}") + +# Run simulator +set(CUSTOM_RUNELF_PATH ${RTOS_DIR}/sdksim_bin/runelf.pbn) + +set(q6ssLine1 "${HEXAGON_ISS_DIR}/qtimer.so --csr_base=0xFC900000 --irq_p=1 --freq=19200000 --cnttid=1\n") +set(q6ssLine2 "${HEXAGON_ISS_DIR}/l2vic.so 32 0xFC910000\n") +set(osamString "${RTOS_DIR}/debugger/lnx64/qurt_model.so\n") +file(WRITE ${CMAKE_BINARY_DIR}/q6ss.cfg ${q6ssLine1}) +file(APPEND ${CMAKE_BINARY_DIR}/q6ss.cfg ${q6ssLine2}) +file(WRITE ${CMAKE_BINARY_DIR}/osam.cfg ${osamString}) + +set(CMAKE_CROSSCOMPILING_EMULATOR + ${HEXAGON_SIM};-m${SIM_V_ARCH};--simulated_returnval;--usefs;${CMAKE_CURRENT_BINARY_DIR};--cosim_file;${CMAKE_BINARY_DIR}/q6ss.cfg;--l2tcm_base;0xd800;--rtos;${CMAKE_BINARY_DIR}/osam.cfg;${CUSTOM_RUNELF_PATH};--;${RUN_MAIN_HEXAGON};--) diff --git a/test/main.h b/test/main.h index c7cc531dc3dcc398fd3570431e9456b1f8ae207e..ee47e83934333717dd2c2a7e6c17b24fa21478ae 100644 --- a/test/main.h +++ b/test/main.h @@ -84,6 +84,11 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #endif +#if defined __HVX__ && (__HVX_LENGTH__ == 128) +// Need to prevent conflict FORBIDDEN_IDENTIFIER B0. 
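For context on the comment above: test/main.h deliberately redefines short identifiers such as B0 (a macro termios.h also claims) to an unusable token sequence, so Eigen code cannot depend on them by accident. Any system header that declares a B0 after that point stops compiling, which is why the HVX types header has to be included first. A simplified model of the trap follows (assumption: the real poison macro in test/main.h differs in its exact spelling):

// Illustrative sketch, not part of the patch.
#define FORBIDDEN_IDENTIFIER (forbidden_identifier) forbidden_identifier
#define B0 FORBIDDEN_IDENTIFIER

// From here on, a declaration like
//   int B0;
// preprocesses to
//   int (forbidden_identifier) forbidden_identifier;
// which is a syntax error -- exactly the conflict the early include avoids.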
+#include +#endif + // To test that all calls from Eigen code to std::min() and std::max() are // protected by parenthesis against macro expansion, the min()/max() macros // are defined here and any not-parenthesized min/max call will cause a diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index 6367db96409ca2c5c469640992460b505c5a05d6..51b0711bef4f92774e6ec4b1b9f9011bcee118dd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -313,10 +313,35 @@ class BaseTensorContractionMapper return pload(data); } + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPartial(Index i, Index j, Index n, Index offset = 0) const { + const Index requested_packet_size = internal::unpacket_traits::size; + EIGEN_ALIGN_MAX Scalar data[requested_packet_size]; + + const IndexPair indexPair = this->computeIndexPair(i, j, requested_packet_size - 1); + const Index first = indexPair.first; + const Index lastIdx = indexPair.second; + + data[0] = this->m_tensor.coeff(first); + for (Index k = 1; k < requested_packet_size - 1; k += 2) { + const IndexPair internal_pair = this->computeIndexPair(i + k, j, 1); + data[k] = this->m_tensor.coeff(internal_pair.first); + data[k + 1] = this->m_tensor.coeff(internal_pair.second); + } + data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx); + + return pload_partial(data, n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const { return this->load(i, j); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPartialPacket(Index i, Index j, Index n, Index offset = 0) const { + return this->loadPartial(i, j, n, offset); + } }; template m_tensor.coeff(this->computeIndex(i, j)); return pload(data); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPartialPacket(Index i, Index j, Index n, Index offset = 0) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload_partial(data, n, offset); + } + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const { EIGEN_ALIGN_MAX Scalar data[1]; data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); return pload(data); } + + template + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPartial(Index i, Index j, Index n, Index offset = 0) const { + EIGEN_ALIGN_MAX Scalar data[1]; + data[0] = this->m_tensor.coeff(this->computeIndex(i, j)); + return pload_partial(data, n, offset); + } }; template (i + m_vert_offset, m_horiz_offset); } + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPartial(Index i, Index n, Index offset = 0) const { + EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MADE_A_PROGRAMMING_MISTAKE); + const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? 
Aligned : Unaligned; + if (UseDirectOffsets) { + return m_base_mapper.template loadPartialPacket(i, 0, n, offset); + } + return m_base_mapper.template loadPartialPacket(i + m_vert_offset, m_horiz_offset, n, + offset); + } + template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const { return false; diff --git a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h index 15d7fb2def473abbc347566cc21545e02d507976..434a58a50b4a70b775f08d83b27ed9c4439c1fa5 100644 --- a/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h +++ b/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h @@ -171,10 +171,14 @@ class MatrixMarketIterator { std::string curfile; curfile = m_folder + "/" + m_curs_id->d_name; // Discard if it is a folder + +#if EIGEN_OS_QURT // QURT does not define dirent's DT_DIR, so fall back to stat() + struct stat st_buf; + const int stat_err = stat(curfile.c_str(), &st_buf); + if (stat_err == 0 && S_ISDIR(st_buf.st_mode)) continue; +#else if (m_curs_id->d_type == DT_DIR) continue; // FIXME This may not be available on non BSD systems - // struct stat st_buf; - // stat (curfile.c_str(), &st_buf); - // if (S_ISDIR(st_buf.st_mode)) continue; +#endif // Determine from the header if it is a matrix or a right hand side bool isvector, iscomplex = false;
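The QURT branch above exists because dirent's d_type field and the DT_DIR constant are BSD/glibc extensions that QURT's libc lacks; stat() plus S_ISDIR is the portable fallback. Reduced to a stand-alone helper (function name hypothetical), the check amounts to:

// Illustrative sketch, not part of the patch.
#include <string>
#include <sys/stat.h>

// Portable "is this path a directory?" test; entries that cannot be
// stat()'d are conservatively treated as non-directories.
inline bool is_directory(const std::string& path) {
  struct stat st_buf;
  if (stat(path.c_str(), &st_buf) != 0) return false;
  return S_ISDIR(st_buf.st_mode);
}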