diff --git a/Eigen/Core b/Eigen/Core
index 5f46dde21f7352089c8ab68537da23917b551c85..94fd6ecc0cd2992548a5ed69895b92f0c7e22063 100644
--- a/Eigen/Core
+++ b/Eigen/Core
@@ -277,6 +277,15 @@ using std::ptrdiff_t;
 #include "src/Core/arch/SVE/PacketMath.h"
 #include "src/Core/arch/SVE/TypeCasting.h"
 #include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_RVV10
+#include "src/Core/arch/RVV10/PacketMath.h"
+#include "src/Core/arch/RVV10/PacketMath4.h"
+#include "src/Core/arch/RVV10/PacketMath2.h"
+#include "src/Core/arch/RVV10/TypeCasting.h"
+#include "src/Core/arch/RVV10/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_RVV10FP16
+#include "src/Core/arch/RVV10/PacketMathFP16.h"
+#endif
 #elif defined EIGEN_VECTORIZE_ZVECTOR
 #include "src/Core/arch/ZVector/PacketMath.h"
 #include "src/Core/arch/ZVector/MathFunctions.h"
diff --git a/Eigen/src/Core/arch/RVV10/MathFunctions.h b/Eigen/src/Core/arch/RVV10/MathFunctions.h
new file mode 100644
index 0000000000000000000000000000000000000000..10a70c446ce542bada8113e5985079fdd0655fe5
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/MathFunctions.h
@@ -0,0 +1,30 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_RVV10_H
+#define EIGEN_MATH_FUNCTIONS_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet1Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet2Xf)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4Xf)
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet1Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2Xd)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet4Xd)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath.h b/Eigen/src/Core/arch/RVV10/PacketMath.h
new file mode 100644
index 0000000000000000000000000000000000000000..54db62634bfea1742fcd5befc7df26c83b709c2b
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMath.h
@@ -0,0 +1,2395 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_RVV10_H
+#define EIGEN_PACKET_MATH_RVV10_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
+template <typename Scalar, std::size_t VectorLength, std::size_t VectorLMul>
+struct rvv_packet_size_selector {
+  enum { size = VectorLength * VectorLMul / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+template <std::size_t VectorLength, std::size_t VectorLMul>
+struct rvv_packet_alignment_selector {
+  enum {
+    alignment = (VectorLength * VectorLMul) >= 1024
+                    ? Aligned128
+                    : ((VectorLength * VectorLMul) >= 512
+                           ? Aligned64
+                           : ((VectorLength * VectorLMul) >= 256 ? Aligned32 : Aligned16))
+  };
+};
+
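+// Illustrative sizing (assuming a build where EIGEN_RISCV64_RVV_VL is 256 bits):
+// rvv_packet_size_selector<float, 256, 2>::size = 256 * 2 / (4 * 8) = 16 lanes, and
+// rvv_packet_alignment_selector<256, 2>::alignment = Aligned64, since the 512-bit
+// register group spans 64 bytes.
+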
+typedef vbool64_t PacketMask64;
+typedef vbool32_t PacketMask32;
+typedef vbool16_t PacketMask16;
+typedef vbool8_t PacketMask8;
+typedef vbool4_t PacketMask4;
+
+/********************************* int32 **************************************/
+typedef eigen_packet_wrapper<vint32m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 0> Packet1Xi;
+typedef eigen_packet_wrapper<vuint32m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 1> Packet1Xu;
+
+typedef eigen_packet_wrapper<vint32m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 2>
+    Packet2Xi;
+typedef eigen_packet_wrapper<vuint32m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 3>
+    Packet2Xu;
+
+typedef eigen_packet_wrapper<vint32m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 4>
+    Packet4Xi;
+typedef eigen_packet_wrapper<vuint32m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 5>
+    Packet4Xu;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xi PacketXi;
+typedef Packet1Xu PacketXu;
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef Packet1Xi type;
+  typedef Packet1Xi half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xi PacketXi;
+typedef Packet2Xu PacketXu;
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef Packet2Xi type;
+  typedef Packet1Xi half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xi PacketXi;
+typedef Packet4Xu PacketXu;
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef Packet4Xi type;
+  typedef Packet2Xi half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet1Xi> {
+  typedef numext::int32_t type;
+  typedef Packet1Xi half;  // Half not yet implemented
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2Xi> {
+  typedef numext::int32_t type;
+  typedef Packet1Xi half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4Xi> {
+  typedef numext::int32_t type;
+  typedef Packet2Xi half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int32_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch(const numext::int32_t* addr) {
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  __builtin_prefetch(addr);
+#endif
+}
+
+/********************************* Packet1Xi ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pset1<Packet1Xi>(const numext::int32_t& from) {
+  return __riscv_vmv_v_x_i32m1(from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi plset<Packet1Xi>(const numext::int32_t& a) {
+  Packet1Xi idx = __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size));
+  return __riscv_vadd_vx_i32m1(idx, a, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pzero(const Packet1Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi padd(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vadd_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi psub(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pnegate(const Packet1Xi& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pconj(const Packet1Xi& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pmul(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pdiv(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pmadd(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pmsub(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pnmadd(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vnmsub_vv_i32m1(a, b, c, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pnmsub(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c) {
+  return __riscv_vnmsub_vv_i32m1(a, b, pnegate(c), unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pmin(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pmax(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
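+// The comparisons below return an integer packet with all bits set (0xffffffff) in
+// lanes where the predicate holds and zero elsewhere, which is Eigen's packet-level
+// boolean convention.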
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pcmp_le(const Packet1Xi& a, const Packet1Xi& b) {
+  PacketMask32 mask = __riscv_vmsle_vv_i32m1_b32(a, b, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pcmp_lt(const Packet1Xi& a, const Packet1Xi& b) {
+  PacketMask32 mask = __riscv_vmslt_vv_i32m1_b32(a, b, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pcmp_eq(const Packet1Xi& a, const Packet1Xi& b) {
+  PacketMask32 mask = __riscv_vmseq_vv_i32m1_b32(a, b, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vmerge_vxm_i32m1(pzero(a), 0xffffffff, mask, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi ptrue(const Packet1Xi& /*a*/) {
+  return __riscv_vmv_v_x_i32m1(0xffffffffu, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pand(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vand_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi por(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vor_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pxor(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vxor_vv_i32m1(a, b, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pandnot(const Packet1Xi& a, const Packet1Xi& b) {
+  return __riscv_vand_vv_i32m1(a, __riscv_vnot_v_i32m1(b, unpacket_traits<Packet1Xi>::size),
+                               unpacket_traits<Packet1Xi>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet1Xi parithmetic_shift_right(Packet1Xi a) {
+  return __riscv_vsra_vx_i32m1(a, N, unpacket_traits<Packet1Xi>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet1Xi plogical_shift_right(Packet1Xi a) {
+  return __riscv_vreinterpret_i32m1(
+      __riscv_vsrl_vx_u32m1(__riscv_vreinterpret_u32m1(a), N, unpacket_traits<Packet1Xi>::size));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet1Xi plogical_shift_left(Packet1Xi a) {
+  return __riscv_vsll_vx_i32m1(a, N, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pload<Packet1Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi ploadu<Packet1Xi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m1(from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi ploaddup<Packet1Xi>(const numext::int32_t* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size);
+  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<Packet1Xi>::size), 1,
+                              unpacket_traits<Packet1Xi>::size);
+  // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ...
+  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi ploadquad<Packet1Xi>(const numext::int32_t* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vloxei32_v_i32m1(from, idx, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const Packet1Xi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Packet1Xi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m1(to, from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1Xi pgather<numext::int32_t, Packet1Xi>(const numext::int32_t* from, Index stride) {
+  return __riscv_vlse32_v_i32m1(from, stride * sizeof(numext::int32_t), unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, Packet1Xi>(numext::int32_t* to, const Packet1Xi& from,
+                                                                   Index stride) {
+  __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet1Xi& a) {
+  return __riscv_vmv_x_s_i32m1_i32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xi preverse(const Packet1Xi& a) {
+  Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xi>::size),
+                                         unpacket_traits<Packet1Xi>::size - 1, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vrgather_vv_i32m1(a, idx, unpacket_traits<Packet1Xi>::size);
+}
+
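+// Branch-free absolute value: mask = a >> 31 is all ones for negative lanes, so
+// (a ^ mask) - mask negates exactly those lanes (e.g. a = -5: (-5 ^ -1) - (-1) = 5).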
+template <>
+EIGEN_STRONG_INLINE Packet1Xi pabs(const Packet1Xi& a) {
+  Packet1Xi mask = __riscv_vsra_vx_i32m1(a, 31, unpacket_traits<Packet1Xi>::size);
+  return __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(a, mask, unpacket_traits<Packet1Xi>::size), mask,
+                               unpacket_traits<Packet1Xi>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(0, unpacket_traits<Packet1Xi>::size),
+                                                      unpacket_traits<Packet1Xi>::size));
+}
+
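+// predux_mul multiplies the vector by its own reverse, then repeatedly folds the
+// low half onto itself (slide by 8, 4, 2, 1 lanes), halving the live lanes per step;
+// the EIGEN_RISCV64_RVV_VL guards skip folds that exceed the actual lane count.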
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet1Xi& a) {
+  // Multiply the vector by its reverse
+  Packet1Xi prod = __riscv_vmul_vv_i32m1(preverse(a), a, unpacket_traits<Packet1Xi>::size);
+  Packet1Xi half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 8, unpacket_traits<Packet1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 4, unpacket_traits<Packet1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_i32m1(prod, 2, unpacket_traits<Packet1Xi>::size);
+    prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
+  }
+  // Last reduction
+  half_prod = __riscv_vslidedown_vx_i32m1(prod, 1, unpacket_traits<Packet1Xi>::size);
+  prod = __riscv_vmul_vv_i32m1(prod, half_prod, unpacket_traits<Packet1Xi>::size);
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i32m1_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::max)(), unpacket_traits<Packet1Xi>::size),
+      unpacket_traits<Packet1Xi>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max(const Packet1Xi& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i32m1_i32m1(
+      a, __riscv_vmv_v_x_i32m1((std::numeric_limits<numext::int32_t>::min)(), unpacket_traits<Packet1Xi>::size),
+      unpacket_traits<Packet1Xi>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xi, N>& kernel) {
+  numext::int32_t buffer[unpacket_traits<Packet1Xi>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits<Packet1Xi>::size);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle32_v_i32m1(&buffer[i * unpacket_traits<Packet1Xi>::size], unpacket_traits<Packet1Xi>::size);
+  }
+}
+
+/********************************* float32 ************************************/
+
+typedef eigen_packet_wrapper<vfloat32m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 6> Packet1Xf;
+typedef eigen_packet_wrapper<vfloat32m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 7>
+    Packet2Xf;
+typedef eigen_packet_wrapper<vfloat32m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 8>
+    Packet4Xf;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xf PacketXf;
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet1Xf type;
+  typedef Packet1Xf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xf PacketXf;
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet2Xf type;
+  typedef Packet1Xf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xf PacketXf;
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4Xf type;
+  typedef Packet2Xf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet1Xf> {
+  typedef float type;
+  typedef Packet1Xf half;  // Half not yet implemented
+  typedef Packet1Xi integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask32 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2Xf> {
+  typedef float type;
+  typedef Packet1Xf half;
+  typedef Packet2Xi integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask16 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4Xf> {
+  typedef float type;
+  typedef Packet2Xf half;
+  typedef Packet4Xi integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask8 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<float, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+/********************************* Packet1Xf ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf ptrue(const Packet1Xf& /*a*/) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(0xffffffffu, unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pzero(const Packet1Xf& /*a*/) {
+  return __riscv_vfmv_v_f_f32m1(0.0f, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pabs(const Packet1Xf& a) {
+  return __riscv_vfabs_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pset1<Packet1Xf>(const float& from) {
+  return __riscv_vfmv_v_f_f32m1(from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pset1frombits<Packet1Xf>(numext::uint32_t from) {
+  return __riscv_vreinterpret_f32m1(__riscv_vmv_v_x_u32m1(from, unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf plset<Packet1Xf>(const float& a) {
+  Packet1Xf idx = __riscv_vfcvt_f_x_v_f32m1(
+      __riscv_vreinterpret_v_u32m1_i32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size)),
+      unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfadd_vf_f32m1(idx, a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf padd(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfadd_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf psub(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfsub_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pnegate(const Packet1Xf& a) {
+  return __riscv_vfneg_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pconj(const Packet1Xf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmul(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmul_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pdiv(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfdiv_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmadd(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfmadd_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmsub(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfmsub_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pnmadd(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfnmsub_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pnmsub(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c) {
+  return __riscv_vfnmadd_vv_f32m1(a, b, c, unpacket_traits<Packet1Xf>::size);
+}
+
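+// The plain pmin/pmax propagate NaN: the _tumu (tail/mask undisturbed) forms only
+// compute lanes where both inputs equal themselves, leaving quiet NaNs from the
+// `nans` vector elsewhere. The PropagateNumbers variants defer to vfmin/vfmax,
+// which return the non-NaN operand when exactly one input is NaN.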
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmin(const Packet1Xf& a, const Packet1Xf& b) {
+  Packet1Xf nans =
+      __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<Packet1Xf>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<Packet1Xf>::size);
+
+  return __riscv_vfmin_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmin<PropagateNaN, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return pmin<Packet1Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmin<PropagateNumbers, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmin_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmax(const Packet1Xf& a, const Packet1Xf& b) {
+  Packet1Xf nans =
+      __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);
+  PacketMask32 mask2 = __riscv_vmfeq_vv_f32m1_b32(b, b, unpacket_traits<Packet1Xf>::size);
+  mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits<Packet1Xf>::size);
+
+  return __riscv_vfmax_vv_f32m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmax<PropagateNaN, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return pmax<Packet1Xf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pmax<PropagateNumbers, Packet1Xf>(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vfmax_vv_f32m1(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pcmp_le(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfle_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pcmp_lt(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pcmp_eq(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xf>::size);
+}
+
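+// pcmp_lt_or_nan returns all ones where !(a >= b) holds, i.e. where a < b or where
+// either operand is NaN.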
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pcmp_lt_or_nan(const Packet1Xf& a, const Packet1Xf& b) {
+  PacketMask32 mask = __riscv_vmfge_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfmerge_vfm_f32m1(ptrue(a), 0.0f, mask, unpacket_traits<Packet1Xf>::size);
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pand(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf por(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vor_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pxor(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vxor_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pandnot(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vand_vv_u32m1(
+      __riscv_vreinterpret_v_f32m1_u32m1(a),
+      __riscv_vnot_v_u32m1(__riscv_vreinterpret_v_f32m1_u32m1(b), unpacket_traits<Packet1Xf>::size),
+      unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pload<Packet1Xf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf ploadu<Packet1Xf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m1(from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf ploaddup<Packet1Xf>(const float* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size);
+  idx = __riscv_vsll_vx_u32m1(__riscv_vand_vx_u32m1(idx, 0xfffffffeu, unpacket_traits<Packet1Xf>::size), 1,
+                              unpacket_traits<Packet1Xf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf ploadquad<Packet1Xf>(const float* from) {
+  Packet1Xu idx = __riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size);
+  idx = __riscv_vand_vx_u32m1(idx, 0xfffffffcu, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vloxei32_v_f32m1(from, idx, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(float* to, const Packet1Xf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet1Xf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m1(to, from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1Xf pgather<float, Packet1Xf>(const float* from, Index stride) {
+  return __riscv_vlse32_v_f32m1(from, stride * sizeof(float), unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet1Xf>(float* to, const Packet1Xf& from, Index stride) {
+  __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet1Xf& a) {
+  return __riscv_vfmv_f_s_f32m1_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf psqrt(const Packet1Xf& a) {
+  return __riscv_vfsqrt_v_f32m1(a, unpacket_traits<Packet1Xf>::size);
+}
+
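+// print rounds to integer using the current rounding mode via a float -> int ->
+// float round trip. Lanes with |a| >= 2^23 are already integral (and could overflow
+// the conversion), so they pass through unchanged; NaN lanes are quieted by a + a.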
+template <>
+EIGEN_STRONG_INLINE Packet1Xf print(const Packet1Xf& a) {
+  const Packet1Xf limit = pset1<Packet1Xf>(static_cast<float>(1 << 23));
+  const Packet1Xf abs_a = pabs(a);
+
+  PacketMask32 mask = __riscv_vmfne_vv_f32m1_b32(a, a, unpacket_traits<Packet1Xf>::size);
+  const Packet1Xf x = __riscv_vfadd_vv_f32m1_tumu(mask, a, a, a, unpacket_traits<Packet1Xf>::size);
+  const Packet1Xf new_x = __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1(a, unpacket_traits<Packet1Xf>::size),
+                                                    unpacket_traits<Packet1Xf>::size);
+
+  mask = __riscv_vmflt_vv_f32m1_b32(abs_a, limit, unpacket_traits<Packet1Xf>::size);
+  Packet1Xf signed_x = __riscv_vfsgnj_vv_f32m1(new_x, x, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vmerge_vvm_f32m1(x, signed_x, mask, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pfloor(const Packet1Xf& a) {
+  Packet1Xf tmp = print(a);
+  // If greater, subtract one.
+  PacketMask32 mask = __riscv_vmflt_vv_f32m1_b32(a, tmp, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vfsub_vf_f32m1_tumu(mask, tmp, tmp, 1.0f, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf preverse(const Packet1Xf& a) {
+  Packet1Xu idx = __riscv_vrsub_vx_u32m1(__riscv_vid_v_u32m1(unpacket_traits<Packet1Xf>::size),
+                                         unpacket_traits<Packet1Xf>::size - 1, unpacket_traits<Packet1Xf>::size);
+  return __riscv_vrgather_vv_f32m1(a, idx, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pfrexp(const Packet1Xf& a, Packet1Xf& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet1Xf& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m1_f32m1(
+      a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits<Packet1Xf>::size), unpacket_traits<Packet1Xf>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet1Xf& a) {
+  // Multiply the vector by its reverse
+  Packet1Xf prod = __riscv_vfmul_vv_f32m1(preverse(a), a, unpacket_traits<Packet1Xf>::size);
+  Packet1Xf half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 8, unpacket_traits<Packet1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 4, unpacket_traits<Packet1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f32m1(prod, 2, unpacket_traits<Packet1Xf>::size);
+    prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
+  }
+  // Last reduction
+  half_prod = __riscv_vslidedown_vx_f32m1(prod, 1, unpacket_traits<Packet1Xf>::size);
+  prod = __riscv_vfmul_vv_f32m1(prod, half_prod, unpacket_traits<Packet1Xf>::size);
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
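+// The reductions are seeded with a quiet NaN: vfredmin/vfredmax follow vfmin/vfmax
+// semantics, so the seed is ignored unless every lane is NaN.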
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet1Xf& a) {
+  return (std::min)(
+      __riscv_vfmv_f(__riscv_vfredmin_vs_f32m1_f32m1(
+          a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size),
+          unpacket_traits<Packet1Xf>::size)),
+      (std::numeric_limits<float>::max)());
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet1Xf& a) {
+  return (std::max)(
+      __riscv_vfmv_f(__riscv_vfredmax_vs_f32m1_f32m1(
+          a, __riscv_vfmv_v_f_f32m1((std::numeric_limits<float>::quiet_NaN)(), unpacket_traits<Packet1Xf>::size),
+          unpacket_traits<Packet1Xf>::size)),
+      -(std::numeric_limits<float>::max)());
+}
+
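+// ptranspose stores each packet with a stride of N scalars so the scratch buffer
+// ends up holding the transposed block contiguously; e.g. for N = 2, packets
+// {a0 a1 ...} and {b0 b1 ...} interleave as a0 b0 a1 b1 ..., and the contiguous
+// reloads then pick up the transposed rows.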
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xf, N>& kernel) {
+  float buffer[unpacket_traits<Packet1Xf>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits<Packet1Xf>::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle32_v_f32m1(&buffer[i * unpacket_traits<Packet1Xf>::size], unpacket_traits<Packet1Xf>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xf pldexp(const Packet1Xf& a, const Packet1Xf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask32 por(const PacketMask32& a, const PacketMask32& b) {
+  return __riscv_vmor_mm_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask32 pand(const PacketMask32& a, const PacketMask32& b) {
+  return __riscv_vmand_mm_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask32 pcmp_eq_mask(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vmfeq_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask32 pcmp_lt_mask(const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vmflt_vv_f32m1_b32(a, b, unpacket_traits<Packet1Xf>::size);
+}
+
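+// pselect keeps a where the mask is set and b elsewhere (vmerge takes the
+// masked-off lanes from its first operand).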
+EIGEN_STRONG_INLINE Packet1Xf pselect(const PacketMask32& mask, const Packet1Xf& a, const Packet1Xf& b) {
+  return __riscv_vmerge_vvm_f32m1(b, a, mask, unpacket_traits<Packet1Xf>::size);
+}
+
+/********************************* int64 **************************************/
+
+typedef eigen_packet_wrapper<vint64m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 9> Packet1Xl;
+typedef eigen_packet_wrapper<vuint64m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 10> Packet1Xul;
+
+typedef eigen_packet_wrapper<vint64m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 11>
+    Packet2Xl;
+typedef eigen_packet_wrapper<vuint64m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 12>
+    Packet2Xul;
+
+typedef eigen_packet_wrapper<vint64m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 13>
+    Packet4Xl;
+typedef eigen_packet_wrapper<vuint64m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 14>
+    Packet4Xul;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xl PacketXl;
+typedef Packet1Xul PacketXul;
+
+template <>
+struct packet_traits<numext::int64_t> : default_packet_traits {
+  typedef Packet1Xl type;
+  typedef Packet1Xl half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xl PacketXl;
+typedef Packet2Xul PacketXul;
+
+template <>
+struct packet_traits<numext::int64_t> : default_packet_traits {
+  typedef Packet2Xl type;
+  typedef Packet1Xl half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xl PacketXl;
+typedef Packet4Xul PacketXul;
+
+template <>
+struct packet_traits<numext::int64_t> : default_packet_traits {
+  typedef Packet4Xl type;
+  typedef Packet2Xl half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet1Xl> {
+  typedef numext::int64_t type;
+  typedef Packet1Xl half;  // Half not yet implemented
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2Xl> {
+  typedef numext::int64_t type;
+  typedef Packet1Xl half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4Xl> {
+  typedef numext::int64_t type;
+  typedef Packet2Xl half;
+  typedef numext::uint8_t mask_t;
+  enum {
+    size = rvv_packet_size_selector<numext::int64_t, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch(const numext::int64_t* addr) {
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+  __builtin_prefetch(addr);
+#endif
+}
+
+/********************************* Packet1Xl ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pset1<Packet1Xl>(const numext::int64_t& from) {
+  return __riscv_vmv_v_x_i64m1(from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl plset<Packet1Xl>(const numext::int64_t& a) {
+  Packet1Xl idx = __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size));
+  return __riscv_vadd_vx_i64m1(idx, a, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pzero(const Packet1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl padd(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vadd_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl psub(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vsub(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pnegate(const Packet1Xl& a) {
+  return __riscv_vneg(a, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pconj(const Packet1Xl& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pmul(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmul(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pdiv(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vdiv(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pmadd(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vmadd(a, b, c, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pmsub(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vmadd(a, b, pnegate(c), unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pnmadd(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, c, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pnmsub(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c) {
+  return __riscv_vnmsub_vv_i64m1(a, b, pnegate(c), unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pmin(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmin(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pmax(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vmax(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pcmp_le(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmsle_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pcmp_lt(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmslt_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pcmp_eq(const Packet1Xl& a, const Packet1Xl& b) {
+  PacketMask64 mask = __riscv_vmseq_vv_i64m1_b64(a, b, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vmerge_vxm_i64m1(pzero(a), 0xffffffffffffffff, mask, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl ptrue(const Packet1Xl& /*a*/) {
+  return __riscv_vmv_v_x_i64m1(0xffffffffffffffffu, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pand(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl por(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vor_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pxor(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vxor_vv_i64m1(a, b, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pandnot(const Packet1Xl& a, const Packet1Xl& b) {
+  return __riscv_vand_vv_i64m1(a, __riscv_vnot_v_i64m1(b, unpacket_traits<Packet1Xl>::size),
+                               unpacket_traits<Packet1Xl>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet1Xl parithmetic_shift_right(Packet1Xl a) {
+  return __riscv_vsra_vx_i64m1(a, N, unpacket_traits<Packet1Xl>::size);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet1Xl plogical_shift_right(Packet1Xl a) {
+  return __riscv_vreinterpret_i64m1(
+      __riscv_vsrl_vx_u64m1(__riscv_vreinterpret_u64m1(a), N, unpacket_traits<Packet1Xl>::size));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet1Xl plogical_shift_left(Packet1Xl a) {
+  return __riscv_vsll_vx_i64m1(a, N, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pload<Packet1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl ploadu<Packet1Xl>(const numext::int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m1(from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl ploaddup<Packet1Xl>(const numext::int64_t* from) {
+  Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<Packet1Xl>::size), 2,
+                              unpacket_traits<Packet1Xl>::size);
+  // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ...
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl ploadquad<Packet1Xl>(const numext::int64_t* from) {
+  Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<Packet1Xl>::size), 1,
+                              unpacket_traits<Packet1Xl>::size);
+  return __riscv_vloxei64_v_i64m1(from, idx, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const Packet1Xl& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Packet1Xl& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m1(to, from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1Xl pgather<numext::int64_t, Packet1Xl>(const numext::int64_t* from, Index stride) {
+  return __riscv_vlse64_v_i64m1(from, stride * sizeof(numext::int64_t), unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int64_t, Packet1Xl>(numext::int64_t* to, const Packet1Xl& from,
+                                                                   Index stride) {
+  __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet1Xl& a) {
+  return __riscv_vmv_x_s_i64m1_i64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl preverse(const Packet1Xl& a) {
+  Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xl>::size),
+                                          unpacket_traits<Packet1Xl>::size - 1, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vrgather_vv_i64m1(a, idx, unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xl pabs(const Packet1Xl& a) {
+  Packet1Xl mask = __riscv_vsra_vx_i64m1(a, 63, unpacket_traits<Packet1Xl>::size);
+  return __riscv_vsub_vv_i64m1(__riscv_vxor_vv_i64m1(a, mask, unpacket_traits<Packet1Xl>::size), mask,
+                               unpacket_traits<Packet1Xl>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredsum_vs_i64m1_i64m1(a, __riscv_vmv_v_x_i64m1(0, unpacket_traits<Packet1Xl>::size),
+                                                      unpacket_traits<Packet1Xl>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet1Xl& a) {
+  // Multiply the vector by its reverse
+  Packet1Xl prod = __riscv_vmul_vv_i64m1(preverse(a), a, unpacket_traits<Packet1Xl>::size);
+  Packet1Xl half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 4, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 2, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_i64m1(prod, 1, unpacket_traits<Packet1Xl>::size);
+    prod = __riscv_vmul_vv_i64m1(prod, half_prod, unpacket_traits<Packet1Xl>::size);
+  }
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_min(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredmin_vs_i64m1_i64m1(
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::max)(), unpacket_traits<Packet1Xl>::size),
+      unpacket_traits<Packet1Xl>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int64_t predux_max(const Packet1Xl& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i64m1_i64m1(
+      a, __riscv_vmv_v_x_i64m1((std::numeric_limits<numext::int64_t>::min)(), unpacket_traits<Packet1Xl>::size),
+      unpacket_traits<Packet1Xl>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xl, N>& kernel) {
+  numext::int64_t buffer[unpacket_traits<Packet1Xl>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits<Packet1Xl>::size);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle64_v_i64m1(&buffer[i * unpacket_traits<Packet1Xl>::size], unpacket_traits<Packet1Xl>::size);
+  }
+}
+
+/********************************* double ************************************/
+
+typedef eigen_packet_wrapper<vfloat64m1_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))), 15> Packet1Xd;
+typedef eigen_packet_wrapper<vfloat64m2_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))), 16>
+    Packet2Xd;
+typedef eigen_packet_wrapper<vfloat64m4_t __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 4))), 17>
+    Packet4Xd;
+
+#if EIGEN_RISCV64_DEFAULT_LMUL == 1
+typedef Packet1Xd PacketXd;
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet1Xd type;
+  typedef Packet1Xd half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 1>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 2
+typedef Packet2Xd PacketXd;
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2Xd type;
+  typedef Packet1Xd half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 2>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1
+  };
+};
+
+#elif EIGEN_RISCV64_DEFAULT_LMUL == 4
+typedef Packet4Xd PacketXd;
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet4Xd type;
+  typedef Packet2Xd half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 4>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasRound = 1,
+
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet1Xd> {
+  typedef double type;
+  typedef Packet1Xd half;  // Half not yet implemented
+  typedef Packet1Xl integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask64 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 1>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 1>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2Xd> {
+  typedef double type;
+  typedef Packet1Xd half;
+  typedef Packet2Xl integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask32 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 2>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 2>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4Xd> {
+  typedef double type;
+  typedef Packet2Xd half;
+  typedef Packet4Xl integer_packet;
+  typedef numext::uint8_t mask_t;
+  typedef PacketMask16 packet_mask;
+
+  enum {
+    size = rvv_packet_size_selector<double, EIGEN_RISCV64_RVV_VL, 4>::size,
+    alignment = rvv_packet_alignment_selector<EIGEN_RISCV64_RVV_VL, 4>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+/********************************* Packet1Xd ************************************/
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd ptrue(const Packet1Xd& /*a*/) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(0xffffffffffffffffu, unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pzero(const Packet1Xd& /*a*/) {
+  return __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pabs(const Packet1Xd& a) {
+  return __riscv_vfabs_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pset1<Packet1Xd>(const double& from) {
+  return __riscv_vfmv_v_f_f64m1(from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pset1frombits<Packet1Xd>(numext::uint64_t from) {
+  return __riscv_vreinterpret_f64m1(__riscv_vmv_v_x_u64m1(from, unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd plset<Packet1Xd>(const double& a) {
+  Packet1Xd idx = __riscv_vfcvt_f_x_v_f64m1(
+      __riscv_vreinterpret_v_u64m1_i64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xd>::size)),
+      unpacket_traits<Packet1Xd>::size);
+  return __riscv_vfadd_vf_f64m1(idx, a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd padd(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfadd_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd psub(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfsub_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pnegate(const Packet1Xd& a) {
+  return __riscv_vfneg_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pconj(const Packet1Xd& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmul(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmul_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pdiv(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfdiv_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmadd(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfmadd_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmsub(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfmsub_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pnmadd(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfnmsub_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pnmsub(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c) {
+  return __riscv_vfnmadd_vv_f64m1(a, b, c, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmin(const Packet1Xd& a, const Packet1Xd& b) {
+  Packet1Xd nans =
+      __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<Packet1Xd>::size);
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<Packet1Xd>::size);
+
+  return __riscv_vfmin_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmin<PropagateNaN, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return pmin<Packet1Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmin<PropagateNumbers, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmin_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmax(const Packet1Xd& a, const Packet1Xd& b) {
+  Packet1Xd nans =
+      __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);
+  PacketMask64 mask2 = __riscv_vmfeq_vv_f64m1_b64(b, b, unpacket_traits<Packet1Xd>::size);
+  mask = __riscv_vmand_mm_b64(mask, mask2, unpacket_traits<Packet1Xd>::size);
+
+  return __riscv_vfmax_vv_f64m1_tumu(mask, nans, a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmax<PropagateNaN, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return pmax<Packet1Xd>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pmax<PropagateNumbers, Packet1Xd>(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vfmax_vv_f64m1(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pcmp_le(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfle_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pcmp_lt(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pcmp_eq(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(pzero(a), ptrue(a), mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pcmp_lt_or_nan(const Packet1Xd& a, const Packet1Xd& b) {
+  PacketMask64 mask = __riscv_vmfge_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vfmerge_vfm_f64m1(ptrue(a), 0.0, mask, unpacket_traits<Packet1Xd>::size);
+}
+
+// Logical Operations are not supported for double, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pand(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd por(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vor_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pxor(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vxor_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pandnot(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vand_vv_u64m1(
+      __riscv_vreinterpret_v_f64m1_u64m1(a),
+      __riscv_vnot_v_u64m1(__riscv_vreinterpret_v_f64m1_u64m1(b), unpacket_traits<Packet1Xd>::size),
+      unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pload<Packet1Xd>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd ploadu<Packet1Xd>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m1(from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd ploaddup<Packet1Xd>(const double* from) {
+  Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<Packet1Xd>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffeu, unpacket_traits<Packet1Xd>::size), 2,
+                              unpacket_traits<Packet1Xd>::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd ploadquad<Packet1Xd>(const double* from) {
+  Packet1Xul idx = __riscv_vid_v_u64m1(unpacket_traits<Packet1Xd>::size);
+  idx = __riscv_vsll_vx_u64m1(__riscv_vand_vx_u64m1(idx, 0xfffffffffffffffcu, unpacket_traits<Packet1Xd>::size), 1,
+                              unpacket_traits<Packet1Xd>::size);
+  return __riscv_vloxei64_v_f64m1(from, idx, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore(double* to, const Packet1Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet1Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1Xd pgather<double, Packet1Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet1Xd>(double* to, const Packet1Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst(const Packet1Xd& a) {
+  return __riscv_vfmv_f_s_f64m1_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd psqrt(const Packet1Xd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd print(const Packet1Xd& a) {
+  const Packet1Xd limit = pset1<Packet1Xd>(static_cast<double>(1ull << 52));
+  const Packet1Xd abs_a = pabs(a);
+
+  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);
+  const Packet1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<Packet1Xd>::size);
+  const Packet1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<Packet1Xd>::size),
+                                                    unpacket_traits<Packet1Xd>::size);
+
+  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<Packet1Xd>::size);
+  Packet1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pfloor(const Packet1Xd& a) {
+  Packet1Xd tmp = print(a);
+  // If greater, subtract one.
+template <>
+EIGEN_STRONG_INLINE void pstore(double* to, const Packet1Xd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet1Xd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m1(to, from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1Xd pgather<double, Packet1Xd>(const double* from, Index stride) {
+  return __riscv_vlse64_v_f64m1(from, stride * sizeof(double), unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet1Xd>(double* to, const Packet1Xd& from, Index stride) {
+  __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet1Xd>(const Packet1Xd& a) {
+  return __riscv_vfmv_f_s_f64m1_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd psqrt(const Packet1Xd& a) {
+  return __riscv_vfsqrt_v_f64m1(a, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd print(const Packet1Xd& a) {
+  const Packet1Xd limit = pset1<Packet1Xd>(static_cast<double>(1ull << 52));
+  const Packet1Xd abs_a = pabs(a);
+
+  PacketMask64 mask = __riscv_vmfne_vv_f64m1_b64(a, a, unpacket_traits<Packet1Xd>::size);
+  const Packet1Xd x = __riscv_vfadd_vv_f64m1_tumu(mask, a, a, a, unpacket_traits<Packet1Xd>::size);
+  const Packet1Xd new_x = __riscv_vfcvt_f_x_v_f64m1(__riscv_vfcvt_x_f_v_i64m1(a, unpacket_traits<Packet1Xd>::size),
+                                                    unpacket_traits<Packet1Xd>::size);
+
+  mask = __riscv_vmflt_vv_f64m1_b64(abs_a, limit, unpacket_traits<Packet1Xd>::size);
+  Packet1Xd signed_x = __riscv_vfsgnj_vv_f64m1(new_x, x, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vmerge_vvm_f64m1(x, signed_x, mask, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pfloor(const Packet1Xd& a) {
+  Packet1Xd tmp = print(a);
+  // If greater, subtract one.
+  PacketMask64 mask = __riscv_vmflt_vv_f64m1_b64(a, tmp, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vfsub_vf_f64m1_tumu(mask, tmp, tmp, 1.0, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd preverse(const Packet1Xd& a) {
+  Packet1Xul idx = __riscv_vrsub_vx_u64m1(__riscv_vid_v_u64m1(unpacket_traits<Packet1Xd>::size),
+                                          unpacket_traits<Packet1Xd>::size - 1, unpacket_traits<Packet1Xd>::size);
+  return __riscv_vrgather_vv_f64m1(a, idx, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pfrexp<Packet1Xd>(const Packet1Xd& a, Packet1Xd& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet1Xd>(const Packet1Xd& a) {
+  return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m1_f64m1(
+      a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits<Packet1Xd>::size), unpacket_traits<Packet1Xd>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet1Xd>(const Packet1Xd& a) {
+  // Multiply the vector by its reverse.
+  Packet1Xd prod = __riscv_vfmul_vv_f64m1(preverse(a), a, unpacket_traits<Packet1Xd>::size);
+  Packet1Xd half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 4, unpacket_traits<Packet1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<Packet1Xd>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 2, unpacket_traits<Packet1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<Packet1Xd>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f64m1(prod, 1, unpacket_traits<Packet1Xd>::size);
+    prod = __riscv_vfmul_vv_f64m1(prod, half_prod, unpacket_traits<Packet1Xd>::size);
+  }
+
+  // The reduction is done to the first element.
+  return pfirst(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet1Xd>(const Packet1Xd& a) {
+  return (std::min)(
+      __riscv_vfmv_f(__riscv_vfredmin_vs_f64m1_f64m1(
+          a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size),
+          unpacket_traits<Packet1Xd>::size)),
+      (std::numeric_limits<double>::max)());
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet1Xd>(const Packet1Xd& a) {
+  return (std::max)(
+      __riscv_vfmv_f(__riscv_vfredmax_vs_f64m1_f64m1(
+          a, __riscv_vfmv_v_f_f64m1((std::numeric_limits<double>::quiet_NaN)(), unpacket_traits<Packet1Xd>::size),
+          unpacket_traits<Packet1Xd>::size)),
+      -(std::numeric_limits<double>::max)());
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1Xd, N>& kernel) {
+  double buffer[unpacket_traits<Packet1Xd>::size * N];
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits<Packet1Xd>::size);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle64_v_f64m1(&buffer[i * unpacket_traits<Packet1Xd>::size], unpacket_traits<Packet1Xd>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1Xd pldexp<Packet1Xd>(const Packet1Xd& a, const Packet1Xd& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 por(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmor_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 pandnot(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmandn_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketMask64 pand(const PacketMask64& a, const PacketMask64& b) {
+  return __riscv_vmand_mm_b64(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_eq_mask(const Packet1Xd& a, const Packet1Xd& b) {
+  return __riscv_vmfeq_vv_f64m1_b64(a, b, unpacket_traits<Packet1Xd>::size);
+}
+
+EIGEN_STRONG_INLINE PacketMask64 pcmp_lt_mask(const Packet1Xd& a, const
Packet1Xd& b) { + return __riscv_vmflt_vv_f64m1_b64(a, b, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet1Xd pselect(const PacketMask64& mask, const Packet1Xd& a, const Packet1Xd& b) { + return __riscv_vmerge_vvm_f64m1(b, a, mask, unpacket_traits::size); +} + +/********************************* short **************************************/ + +typedef eigen_packet_wrapper Packet1Xs; +typedef eigen_packet_wrapper Packet1Xsu; + +typedef eigen_packet_wrapper + Packet2Xs; +typedef eigen_packet_wrapper + Packet2Xsu; + +typedef eigen_packet_wrapper + Packet4Xs; +typedef eigen_packet_wrapper + Packet4Xsu; + +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 +typedef Packet1Xs PacketXs; +typedef Packet1Xsu PacketXsu; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet1Xs type; + typedef Packet1Xs half; // Half not implemented yet + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +#elif EIGEN_RISCV64_DEFAULT_LMUL == 2 +typedef Packet2Xs PacketXs; +typedef Packet2Xsu PacketXsu; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2Xs type; + typedef Packet1Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; + +#elif EIGEN_RISCV64_DEFAULT_LMUL == 4 +typedef Packet4Xs PacketXs; +typedef Packet4Xsu PacketXsu; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet4Xs type; + typedef Packet2Xs half; + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0 + }; +}; +#endif + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef Packet1Xs half; // Half not yet implemented + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef Packet1Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef numext::int16_t type; + typedef Packet2Xs half; + typedef numext::uint8_t mask_t; + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +EIGEN_STRONG_INLINE void prefetch(const numext::int16_t* addr) { +#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC + __builtin_prefetch(addr); +#endif +} + +/********************************* Packet1Xs ************************************/ 
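Every trait block above derives its lane count from rvv_packet_size_selector. A standalone sketch of that arithmetic for int16 (mirroring the selector's formula; VLEN = 128 is an assumption, a common baseline for the ratified V extension, and CHAR_BIT = 8 is assumed):

#include <cstdio>

// Mirrors rvv_packet_size_selector: size = VLEN * LMUL / (sizeof(Scalar) * 8).
template <typename Scalar, int VectorLength, int VectorLMul>
struct packet_size {
  enum { value = VectorLength * VectorLMul / (int(sizeof(Scalar)) * 8) };
};

int main() {
  std::printf("int16 m1: %d lanes\n", packet_size<short, 128, 1>::value);  // 8
  std::printf("int16 m2: %d lanes\n", packet_size<short, 128, 2>::value);  // 16
  std::printf("int16 m4: %d lanes\n", packet_size<short, 128, 4>::value);  // 32
  return 0;
}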
+ +template <> +EIGEN_STRONG_INLINE Packet1Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs plset(const numext::int16_t& a) { + Packet1Xs idx = __riscv_vreinterpret_v_u16m1_i16m1(__riscv_vid_v_u16m1(unpacket_traits::size)); + return __riscv_vadd_vx_i16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pzero(const Packet1Xs& /*a*/) { + return __riscv_vmv_v_x_i16m1(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs padd(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vadd_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs psub(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pnegate(const Packet1Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pconj(const Packet1Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pmul(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pdiv(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pmadd(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pmsub(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pnmadd(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pnmsub(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c) { + return __riscv_vnmsub_vv_i16m1(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pmin(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pmax(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pcmp_le(const Packet1Xs& a, const Packet1Xs& b) { + PacketMask16 mask = __riscv_vmsle_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pcmp_lt(const Packet1Xs& a, const Packet1Xs& b) { + PacketMask16 mask = __riscv_vmslt_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pcmp_eq(const Packet1Xs& a, const Packet1Xs& b) { + PacketMask16 mask = __riscv_vmseq_vv_i16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m1(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs ptrue(const Packet1Xs& /*a*/) { + return __riscv_vmv_v_x_i16m1(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pand(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vand_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE 
Packet1Xs por(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pxor(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vxor_vv_i16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pandnot(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vand_vv_i16m1(a, __riscv_vnot_v_i16m1(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet1Xs parithmetic_shift_right(Packet1Xs a) { + return __riscv_vsra_vx_i16m1(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet1Xs plogical_shift_right(Packet1Xs a) { + return __riscv_vreinterpret_i16m1( + __riscv_vsrl_vx_u16m1(__riscv_vreinterpret_u16m1(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet1Xs plogical_shift_left(Packet1Xs a) { + return __riscv_vsll_vx_i16m1(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m1(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs ploaddup(const numext::int16_t* from) { + Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs ploadquad(const numext::int16_t* from) { + Packet1Xsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m1(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const Packet1Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Packet1Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m1(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet1Xs pgather(const numext::int16_t* from, Index stride) { + return __riscv_vlse16_v_i16m1(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet1Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet1Xs& a) { + return __riscv_vmv_x_s_i16m1_i16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs preverse(const Packet1Xs& a) { + Packet1Xsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m1(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xs pabs(const Packet1Xs& a) { + Packet1Xs mask = __riscv_vsra_vx_i16m1(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const 
Packet1Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const Packet1Xs& a) { + // Multiply the vector by its reverse + Packet1Xs prod = __riscv_vmul_vv_i16m1(preverse(a), a, unpacket_traits::size); + Packet1Xs half_prod; + + if (EIGEN_RISCV64_RVV_VL >= 1024) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 16, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 512) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 8, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + if (EIGEN_RISCV64_RVV_VL >= 256) { + half_prod = __riscv_vslidedown_vx_i16m1(prod, 4, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + } + // Last reduction + half_prod = __riscv_vslidedown_vx_i16m1(prod, 2, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + half_prod = __riscv_vslidedown_vx_i16m1(prod, 1, unpacket_traits::size); + prod = __riscv_vmul_vv_i16m1(prod, half_prod, unpacket_traits::size); + + // The reduction is done to the first element. + return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const Packet1Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const Packet1Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m1_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), unpacket_traits::size), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m1(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET_MATH_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMath2.h b/Eigen/src/Core/arch/RVV10/PacketMath2.h new file mode 100644 index 0000000000000000000000000000000000000000..1fda5113163ede2511fa780fbf4a9d0d973fe4a9 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMath2.h @@ -0,0 +1,1506 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2024 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
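+
+// This header provides the LMUL=2 (m2) register-group variants of the packet
+// kernels from PacketMath.h; each m2 packet carries twice the lanes of its m1
+// counterpart at the same VLEN.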
+ +#ifndef EIGEN_PACKET2_MATH_RVV10_H +#define EIGEN_PACKET2_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* Packet2Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi plset(const numext::int32_t& a) { + Packet2Xi idx = __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)); + return __riscv_vadd_vx_i32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pzero(const Packet2Xi& /*a*/) { + return __riscv_vmv_v_x_i32m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi padd(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vadd_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi psub(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pnegate(const Packet2Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pconj(const Packet2Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pmul(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pdiv(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pmadd(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pmsub(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pnmadd(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pnmsub(const Packet2Xi& a, const Packet2Xi& b, const Packet2Xi& c) { + return __riscv_vnmsub_vv_i32m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pmin(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pmax(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pcmp_le(const Packet2Xi& a, const Packet2Xi& b) { + PacketMask16 mask = __riscv_vmsle_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pcmp_lt(const Packet2Xi& a, const Packet2Xi& b) { + PacketMask16 mask = __riscv_vmslt_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pcmp_eq(const Packet2Xi& a, const Packet2Xi& b) { + PacketMask16 mask = __riscv_vmseq_vv_i32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m2(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi ptrue(const Packet2Xi& /*a*/) { + return 
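+      // All-ones payload, the same "true" encoding the pcmp_* kernels above merge in.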
__riscv_vmv_v_x_i32m2(0xffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pand(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vand_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi por(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pxor(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vxor_vv_i32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pandnot(const Packet2Xi& a, const Packet2Xi& b) { + return __riscv_vand_vv_i32m2(a, __riscv_vnot_v_i32m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet2Xi parithmetic_shift_right(Packet2Xi a) { + return __riscv_vsra_vx_i32m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet2Xi plogical_shift_right(Packet2Xi a) { + return __riscv_vreinterpret_i32m2( + __riscv_vsrl_vx_u32m2(__riscv_vreinterpret_u32m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet2Xi plogical_shift_left(Packet2Xi a) { + return __riscv_vsll_vx_i32m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi ploaddup(const numext::int32_t* from) { + Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
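+  // vloxei32 indices are byte offsets: (i & ~1) << 1 == 4 * (i / 2), the offset
+  // of source element i / 2 for 4-byte int32 lanes.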
+ return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi ploadquad(const numext::int32_t* from) { + Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const Packet2Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Packet2Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m2(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet2Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet2Xi& a) { + return __riscv_vmv_x_s_i32m2_i32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi preverse(const Packet2Xi& a) { + Packet2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pabs(const Packet2Xi& a) { + Packet2Xi mask = __riscv_vsra_vx_i32m2(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m2(__riscv_vxor_vv_i32m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const Packet2Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet2Xi& a) { + return predux_mul(__riscv_vmul_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const Packet2Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const Packet2Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m2_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xi>::type +predux_half_dowto4(const Packet4Xi& a) { + return __riscv_vadd_vv_i32m2(__riscv_vget_v_i32m4_i32m2(a, 0), __riscv_vget_v_i32m4_i32m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && 
(unpacket_traits::size % 8) == 0, + Packet1Xi>::type +predux_half_dowto4(const Packet2Xi& a) { + return __riscv_vadd_vv_i32m1(__riscv_vget_v_i32m2_i32m1(a, 0), __riscv_vget_v_i32m2_i32m1(a, 1), + unpacket_traits::size); +} + +/********************************* Packet2Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xf ptrue(const Packet2Xf& /*a*/) { + return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pzero(const Packet2Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m2(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pabs(const Packet2Xf& a) { + return __riscv_vfabs_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m2(__riscv_vmv_v_x_u32m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf plset(const float& a) { + Packet2Xf idx = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vreinterpret_v_u32m2_i32m2(__riscv_vid_v_u32m2(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf padd(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfadd_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf psub(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfsub_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pnegate(const Packet2Xf& a) { + return __riscv_vfneg_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pconj(const Packet2Xf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmul(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfmul_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pdiv(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfdiv_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmadd(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) { + return __riscv_vfmadd_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmsub(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) { + return __riscv_vfmsub_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pnmadd(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) { + return __riscv_vfnmsub_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pnmsub(const Packet2Xf& a, const Packet2Xf& b, const Packet2Xf& c) { + return __riscv_vfnmadd_vv_f32m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& a, const Packet2Xf& b) { + Packet2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& a, const 
Packet2Xf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmin(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfmin_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { + Packet2Xf nans = + __riscv_vfmv_v_f_f32m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f32m2_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pmax(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vfmax_vv_f32m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcmp_le(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmfle_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcmp_lt(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcmp_eq(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcmp_lt_or_nan(const Packet2Xf& a, const Packet2Xf& b) { + PacketMask16 mask = __riscv_vmfge_vv_f32m2_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m2(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE Packet2Xf pand(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf por(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pxor(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vxor_vv_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vreinterpret_v_f32m2_u32m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pandnot(const Packet2Xf& a, const Packet2Xf& b) { + return __riscv_vreinterpret_v_u32m2_f32m2(__riscv_vand_vv_u32m2( + __riscv_vreinterpret_v_f32m2_u32m2(a), + __riscv_vnot_v_u32m2(__riscv_vreinterpret_v_f32m2_u32m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m2(from, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf ploaddup(const float* from) { + Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m2(__riscv_vand_vx_u32m2(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf ploadquad(const float* from) { + Packet2Xu idx = __riscv_vid_v_u32m2(unpacket_traits::size); + idx = __riscv_vand_vx_u32m2(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet2Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet2Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m2(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet2Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet2Xf& a) { + return __riscv_vfmv_f_s_f32m2_f32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf psqrt(const Packet2Xf& a) { + return __riscv_vfsqrt_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf print(const Packet2Xf& a) { + const Packet2Xf limit = pset1(static_cast(1 << 23)); + const Packet2Xf abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f32m2_b16(a, a, unpacket_traits::size); + const Packet2Xf x = __riscv_vfadd_vv_f32m2_tumu(mask, a, a, a, unpacket_traits::size); + const Packet2Xf new_x = __riscv_vfcvt_f_x_v_f32m2( + __riscv_vfcvt_x_f_v_i32m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m2_b16(abs_a, limit, unpacket_traits::size); + Packet2Xf signed_x = __riscv_vfsgnj_vv_f32m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pfloor(const Packet2Xf& a) { + Packet2Xf tmp = print(a); + // If greater, subtract one. 
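+  // print() rounds to nearest, so tmp may exceed a; the masked _tumu subtract
+  // below fixes only those lanes and leaves the rest of tmp undisturbed.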
+ PacketMask16 mask = __riscv_vmflt_vv_f32m2_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m2_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf preverse(const Packet2Xf& a) { + Packet2Xu idx = + __riscv_vrsub_vx_u32m2(__riscv_vid_v_u32m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pfrexp(const Packet2Xf& a, Packet2Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet2Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m2_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet2Xf& a) { + return predux_mul(__riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet2Xf& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m2_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet2Xf& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m2_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pldexp(const Packet2Xf& a, const Packet2Xf& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xf>::type +predux_half_dowto4(const Packet4Xf& a) { + return __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(a, 0), __riscv_vget_v_f32m4_f32m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xf>::type +predux_half_dowto4(const Packet2Xf& a) { + return __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(a, 0), __riscv_vget_v_f32m2_f32m1(a, 1), + unpacket_traits::size); +} + +/********************************* Packet2Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl plset(const numext::int64_t& a) { + Packet2Xl idx = __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)); + return __riscv_vadd_vx_i64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pzero(const Packet2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl padd(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vadd_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> 
+EIGEN_STRONG_INLINE Packet2Xl psub(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pnegate(const Packet2Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pconj(const Packet2Xl& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pmul(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pdiv(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pmadd(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pmsub(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pnmadd(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pnmsub(const Packet2Xl& a, const Packet2Xl& b, const Packet2Xl& c) { + return __riscv_vnmsub_vv_i64m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pmin(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pmax(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pcmp_le(const Packet2Xl& a, const Packet2Xl& b) { + PacketMask32 mask = __riscv_vmsle_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pcmp_lt(const Packet2Xl& a, const Packet2Xl& b) { + PacketMask32 mask = __riscv_vmslt_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pcmp_eq(const Packet2Xl& a, const Packet2Xl& b) { + PacketMask32 mask = __riscv_vmseq_vv_i64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m2(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl ptrue(const Packet2Xl& /*a*/) { + return __riscv_vmv_v_x_i64m2(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pand(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vand_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl por(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pxor(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vxor_vv_i64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pandnot(const Packet2Xl& a, const Packet2Xl& b) { + return __riscv_vand_vv_i64m2(a, __riscv_vnot_v_i64m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet2Xl parithmetic_shift_right(Packet2Xl a) { + return __riscv_vsra_vx_i64m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet2Xl plogical_shift_right(Packet2Xl a) { + 
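+  // Shift the unsigned reinterpretation so vacated bits fill with zeros; vsra
+  // on the signed type (parithmetic_shift_right) would replicate the sign bit.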
return __riscv_vreinterpret_i64m2( + __riscv_vsrl_vx_u64m2(__riscv_vreinterpret_u64m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet2Xl plogical_shift_left(Packet2Xl a) { + return __riscv_vsll_vx_i64m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pload(const numext::int64_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl ploaddup(const numext::int64_t* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl ploadquad(const numext::int64_t* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const Packet2Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Packet2Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m2(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet2Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet2Xl& a) { + return __riscv_vmv_x_s_i64m2_i64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl preverse(const Packet2Xl& a) { + Packet2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pabs(const Packet2Xl& a) { + Packet2Xl mask = __riscv_vsra_vx_i64m2(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m2(__riscv_vxor_vv_i64m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const Packet2Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet2Xl& a) { + return predux_mul(__riscv_vmul_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const Packet2Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 2), + unpacket_traits::size)); 
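+  // int64 max is the neutral seed for the min reduction, so the scalar operand
+  // can never displace a real element.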
+} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const Packet2Xl& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i64m2_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xl>::type +predux_half_dowto4(const Packet4Xl& a) { + return __riscv_vadd_vv_i64m2(__riscv_vget_v_i64m4_i64m2(a, 0), __riscv_vget_v_i64m4_i64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xl>::type +predux_half_dowto4(const Packet2Xl& a) { + return __riscv_vadd_vv_i64m1(__riscv_vget_v_i64m2_i64m1(a, 0), __riscv_vget_v_i64m2_i64m1(a, 1), + unpacket_traits::size); +} + +/********************************* Packet2Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xd ptrue(const Packet2Xd& /*a*/) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pzero(const Packet2Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m2(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pabs(const Packet2Xd& a) { + return __riscv_vfabs_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m2(__riscv_vmv_v_x_u64m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd plset(const double& a) { + Packet2Xd idx = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vreinterpret_v_u64m2_i64m2(__riscv_vid_v_u64m2(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd padd(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfadd_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd psub(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfsub_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pnegate(const Packet2Xd& a) { + return __riscv_vfneg_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pconj(const Packet2Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmul(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfmul_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pdiv(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfdiv_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmadd(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmsub(const Packet2Xd& a, const Packet2Xd& b, 
const Packet2Xd& c) { + return __riscv_vfmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pnmadd(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfnmsub_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pnmsub(const Packet2Xd& a, const Packet2Xd& b, const Packet2Xd& c) { + return __riscv_vfnmadd_vv_f64m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { + Packet2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmin(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfmin_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { + Packet2Xd nans = + __riscv_vfmv_v_f_f64m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, a, unpacket_traits::size); + PacketMask32 mask2 = __riscv_vmfeq_vv_f64m2_b32(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b32(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m2_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pmax(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vfmax_vv_f64m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcmp_le(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmfle_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcmp_lt(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcmp_eq(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmfeq_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcmp_lt_or_nan(const Packet2Xd& a, const Packet2Xd& b) { + PacketMask32 mask = __riscv_vmfge_vv_f64m2_b32(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m2(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE Packet2Xd pand(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd por(const Packet2Xd& a, const Packet2Xd& b) { + return 
__riscv_vreinterpret_v_u64m2_f64m2(__riscv_vor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pxor(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vxor_vv_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vreinterpret_v_f64m2_u64m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pandnot(const Packet2Xd& a, const Packet2Xd& b) { + return __riscv_vreinterpret_v_u64m2_f64m2(__riscv_vand_vv_u64m2( + __riscv_vreinterpret_v_f64m2_u64m2(a), + __riscv_vnot_v_u64m2(__riscv_vreinterpret_v_f64m2_u64m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd ploaddup(const double* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd ploadquad(const double* from) { + Packet2Xul idx = __riscv_vid_v_u64m2(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m2(__riscv_vand_vx_u64m2(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet2Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet2Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m2(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet2Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet2Xd& a) { + return __riscv_vfmv_f_s_f64m2_f64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd psqrt(const Packet2Xd& a) { + return __riscv_vfsqrt_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd print(const Packet2Xd& a) { + const Packet2Xd limit = pset1(static_cast(1ull << 52)); + const Packet2Xd abs_a = pabs(a); + + PacketMask32 mask = __riscv_vmfne_vv_f64m2_b32(a, a, unpacket_traits::size); + const Packet2Xd x = __riscv_vfadd_vv_f64m2_tumu(mask, a, a, a, unpacket_traits::size); + const Packet2Xd new_x = __riscv_vfcvt_f_x_v_f64m2( + __riscv_vfcvt_x_f_v_i64m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m2_b32(abs_a, limit, unpacket_traits::size); + Packet2Xd signed_x = __riscv_vfsgnj_vv_f64m2(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pfloor(const Packet2Xd& a) { + Packet2Xd tmp = print(a); + // 
If greater, subtract one. + PacketMask32 mask = __riscv_vmflt_vv_f64m2_b32(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m2_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd preverse(const Packet2Xd& a) { + Packet2Xul idx = + __riscv_vrsub_vx_u64m2(__riscv_vid_v_u64m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pfrexp(const Packet2Xd& a, Packet2Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const Packet2Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m2_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet2Xd& a) { + return predux_mul(__riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet2Xd& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m2_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet2Xd& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m2_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 2), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m2(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pldexp(const Packet2Xd& a, const Packet2Xd& exponent) { + return pldexp_generic(a, exponent); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet2Xd>::type +predux_half_dowto4(const Packet4Xd& a) { + return __riscv_vfadd_vv_f64m2(__riscv_vget_v_f64m4_f64m2(a, 0), __riscv_vget_v_f64m4_f64m2(a, 1), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + Packet1Xd>::type +predux_half_dowto4(const Packet2Xd& a) { + return __riscv_vfadd_vv_f64m1(__riscv_vget_v_f64m2_f64m1(a, 0), __riscv_vget_v_f64m2_f64m1(a, 1), + unpacket_traits::size); +} + +/********************************* Packet2Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs plset(const numext::int16_t& a) { + Packet2Xs idx = __riscv_vreinterpret_v_u16m2_i16m2(__riscv_vid_v_u16m2(unpacket_traits::size)); + return __riscv_vadd_vx_i16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pzero(const Packet2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs padd(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vadd_vv_i16m2(a, b, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs psub(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pnegate(const Packet2Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pconj(const Packet2Xs& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pmul(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pdiv(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pmadd(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pmsub(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pnmadd(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pnmsub(const Packet2Xs& a, const Packet2Xs& b, const Packet2Xs& c) { + return __riscv_vnmsub_vv_i16m2(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pmin(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pmax(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pcmp_le(const Packet2Xs& a, const Packet2Xs& b) { + PacketMask8 mask = __riscv_vmsle_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pcmp_lt(const Packet2Xs& a, const Packet2Xs& b) { + PacketMask8 mask = __riscv_vmslt_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pcmp_eq(const Packet2Xs& a, const Packet2Xs& b) { + PacketMask8 mask = __riscv_vmseq_vv_i16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m2(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs ptrue(const Packet2Xs& /*a*/) { + return __riscv_vmv_v_x_i16m2(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pand(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vand_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs por(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pxor(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vxor_vv_i16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pandnot(const Packet2Xs& a, const Packet2Xs& b) { + return __riscv_vand_vv_i16m2(a, __riscv_vnot_v_i16m2(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet2Xs parithmetic_shift_right(Packet2Xs a) { + return __riscv_vsra_vx_i16m2(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet2Xs 
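+// vsra (used by parithmetic_shift_right above) replicates the sign bit, and
+// RVV has no zero-filling shift on signed element types, so the function
+// below bit-casts to u16m2, uses vsrl, and casts back at no cost.
+// Scalar model (illustrative sketch):
+//   int16_t lsr_n(int16_t a) { return (int16_t)((uint16_t)a >> N); }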
plogical_shift_right(Packet2Xs a) { + return __riscv_vreinterpret_i16m2( + __riscv_vsrl_vx_u16m2(__riscv_vreinterpret_u16m2(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet2Xs plogical_shift_left(Packet2Xs a) { + return __riscv_vsll_vx_i16m2(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pload(const numext::int16_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m2(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs ploaddup(const numext::int16_t* from) { + Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs ploadquad(const numext::int16_t* from) { + Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m2(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const Packet2Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Packet2Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m2(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m2(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet2Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet2Xs& a) { + return __riscv_vmv_x_s_i16m2_i16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs preverse(const Packet2Xs& a) { + Packet2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xs pabs(const Packet2Xs& a) { + Packet2Xs mask = __riscv_vsra_vx_i16m2(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m2(__riscv_vxor_vv_i16m2(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const Packet2Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 2), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const Packet2Xs& a) { + return predux_mul(__riscv_vmul_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const Packet2Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m2_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 2), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE 
numext::int16_t predux_max<Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vmv_x(__riscv_vredmax_vs_i16m2_i16m1(
+      a, __riscv_vmv_v_x_i16m1((std::numeric_limits<numext::int16_t>::min)(), unpacket_traits<Packet2Xs>::size / 2),
+      unpacket_traits<Packet2Xs>::size));
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2Xs, N>& kernel) {
+  numext::int16_t buffer[unpacket_traits<Packet2Xs>::size * N] = {0};
+  int i = 0;
+
+  for (i = 0; i < N; i++) {
+    __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits<Packet2Xs>::size);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] =
+        __riscv_vle16_v_i16m2(&buffer[i * unpacket_traits<Packet2Xs>::size], unpacket_traits<Packet2Xs>::size);
+  }
+}
+
+template <typename Packet = Packet4Xs>
+EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<Packet, Packet4Xs>::value && (unpacket_traits<Packet4Xs>::size % 8) == 0,
+                            Packet2Xs>::type
+    predux_half_dowto4(const Packet4Xs& a) {
+  return __riscv_vadd_vv_i16m2(__riscv_vget_v_i16m4_i16m2(a, 0), __riscv_vget_v_i16m4_i16m2(a, 1),
+                               unpacket_traits<Packet2Xs>::size);
+}
+
+template <typename Packet = Packet2Xs>
+EIGEN_STRONG_INLINE
+    typename std::enable_if<std::is_same<Packet, Packet2Xs>::value && (unpacket_traits<Packet2Xs>::size % 8) == 0,
+                            Packet1Xs>::type
+    predux_half_dowto4(const Packet2Xs& a) {
+  return __riscv_vadd_vv_i16m1(__riscv_vget_v_i16m2_i16m1(a, 0), __riscv_vget_v_i16m2_i16m1(a, 1),
+                               unpacket_traits<Packet1Xs>::size);
+}
+
+} // namespace internal
+} // namespace Eigen
+
+#endif // EIGEN_PACKET2_MATH_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/PacketMath4.h b/Eigen/src/Core/arch/RVV10/PacketMath4.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f5ca33d598c8dc82ec9a593fc35d1a6e4d5ad5
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/PacketMath4.h
@@ -0,0 +1,1431 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
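+// PacketMath4.h instantiates the same primitives as the LMUL=1/LMUL=2
+// headers, but for LMUL=4 register groups (Packet4Xi/4Xf/4Xl/4Xd/4Xs).
+// Lane counts follow rvv_packet_size_selector, lanes = VLEN * LMUL /
+// (8 * sizeof(Scalar)); e.g. with VLEN = 128 (an assumption for
+// illustration, not a requirement) a Packet4Xf holds 128 * 4 / 32 = 16
+// floats.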
+ +#ifndef EIGEN_PACKET4_MATH_RVV10_H +#define EIGEN_PACKET4_MATH_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* Packet4Xi ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet4Xi pset1(const numext::int32_t& from) { + return __riscv_vmv_v_x_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi plset(const numext::int32_t& a) { + Packet4Xi idx = __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)); + return __riscv_vadd_vx_i32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pzero(const Packet4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi padd(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vadd_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi psub(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pnegate(const Packet4Xi& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pconj(const Packet4Xi& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pmul(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pdiv(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pmadd(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pmsub(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pnmadd(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pnmsub(const Packet4Xi& a, const Packet4Xi& b, const Packet4Xi& c) { + return __riscv_vnmsub_vv_i32m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pmin(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pmax(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pcmp_le(const Packet4Xi& a, const Packet4Xi& b) { + PacketMask8 mask = __riscv_vmsle_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pcmp_lt(const Packet4Xi& a, const Packet4Xi& b) { + PacketMask8 mask = __riscv_vmslt_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pcmp_eq(const Packet4Xi& a, const Packet4Xi& b) { + PacketMask8 mask = __riscv_vmseq_vv_i32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i32m4(pzero(a), 0xffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi ptrue(const Packet4Xi& /*a*/) { + return __riscv_vmv_v_x_i32m4(0xffffffffu, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pand(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vand_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi por(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pxor(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vxor_vv_i32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pandnot(const Packet4Xi& a, const Packet4Xi& b) { + return __riscv_vand_vv_i32m4(a, __riscv_vnot_v_i32m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet4Xi parithmetic_shift_right(Packet4Xi a) { + return __riscv_vsra_vx_i32m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet4Xi plogical_shift_right(Packet4Xi a) { + return __riscv_vreinterpret_i32m4( + __riscv_vsrl_vx_u32m4(__riscv_vreinterpret_u32m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet4Xi plogical_shift_left(Packet4Xi a) { + return __riscv_vsll_vx_i32m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pload(const numext::int32_t* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi ploadu(const numext::int32_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_i32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi ploaddup(const numext::int32_t* from) { + Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + // idx = 0 0 sizeof(int32_t) sizeof(int32_t) 2*sizeof(int32_t) 2*sizeof(int32_t) ... 
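+  // vloxei32 indexes in bytes: clearing the low bit of the lane id and
+  // shifting left by one gives (i & ~1) * 2 == (i / 2) * sizeof(int32_t),
+  // i.e. byte offsets 0, 0, 4, 4, 8, 8, ..., so every source element is
+  // fetched twice. Scalar model (illustrative sketch):
+  //   for (int i = 0; i < lanes; ++i) out[i] = from[i / 2];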
+ return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi ploadquad(const numext::int32_t* from) { + Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_i32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int32_t* to, const Packet4Xi& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int32_t* to, const Packet4Xi& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_i32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4Xi pgather(const numext::int32_t* from, + Index stride) { + return __riscv_vlse32_v_i32m4(from, stride * sizeof(numext::int32_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int32_t* to, const Packet4Xi& from, + Index stride) { + __riscv_vsse32(to, stride * sizeof(numext::int32_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t pfirst(const Packet4Xi& a) { + return __riscv_vmv_x_s_i32m4_i32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi preverse(const Packet4Xi& a) { + Packet4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pabs(const Packet4Xi& a) { + Packet4Xi mask = __riscv_vsra_vx_i32m4(a, 31, unpacket_traits::size); + return __riscv_vsub_vv_i32m4(__riscv_vxor_vv_i32m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux(const Packet4Xi& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_mul(const Packet4Xi& a) { + Packet1Xi half1 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 0), __riscv_vget_v_i32m4_i32m1(a, 1), + unpacket_traits::size); + Packet1Xi half2 = __riscv_vmul_vv_i32m1(__riscv_vget_v_i32m4_i32m1(a, 2), __riscv_vget_v_i32m4_i32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_min(const Packet4Xi& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int32_t predux_max(const Packet4Xi& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i32m4_i32m1( + a, __riscv_vmv_v_x_i32m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int32_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(numext::int32_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_i32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* Packet4Xf ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet4Xf 
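+// 0xffffffff is a NaN bit pattern in float, so the all-ones value ptrue
+// returns is built as an unsigned splat plus a bit-cast rather than as a
+// float splat.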
ptrue(const Packet4Xf& /*a*/) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(0xffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pzero(const Packet4Xf& /*a*/) { + return __riscv_vfmv_v_f_f32m4(0.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pabs(const Packet4Xf& a) { + return __riscv_vfabs_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pset1(const float& from) { + return __riscv_vfmv_v_f_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pset1frombits(numext::uint32_t from) { + return __riscv_vreinterpret_f32m4(__riscv_vmv_v_x_u32m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf plset(const float& a) { + Packet4Xf idx = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vreinterpret_v_u32m4_i32m4(__riscv_vid_v_u32m4(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f32m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf padd(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfadd_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf psub(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfsub_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pnegate(const Packet4Xf& a) { + return __riscv_vfneg_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pconj(const Packet4Xf& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmul(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfmul_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pdiv(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfdiv_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmadd(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) { + return __riscv_vfmadd_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmsub(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) { + return __riscv_vfmsub_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pnmadd(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) { + return __riscv_vfnmsub_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pnmsub(const Packet4Xf& a, const Packet4Xf& b, const Packet4Xf& c) { + return __riscv_vfnmadd_vv_f32m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmin(const Packet4Xf& a, const Packet4Xf& b) { + Packet4Xf nans = + __riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmin(const Packet4Xf& a, const Packet4Xf& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmin(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfmin_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmax(const Packet4Xf& a, const Packet4Xf& b) { + Packet4Xf nans = + 
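+      // Seed every lane with quiet NaN; the vfmax below uses the "tumu"
+      // (tail undisturbed, mask undisturbed) policy, so it overwrites only
+      // lanes where both a and b compared equal to themselves, i.e. where
+      // neither input is NaN.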
__riscv_vfmv_v_f_f32m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f32m4_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f32m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmax(const Packet4Xf& a, const Packet4Xf& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pmax(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vfmax_vv_f32m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcmp_le(const Packet4Xf& a, const Packet4Xf& b) { + PacketMask8 mask = __riscv_vmfle_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcmp_lt(const Packet4Xf& a, const Packet4Xf& b) { + PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcmp_eq(const Packet4Xf& a, const Packet4Xf& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcmp_lt_or_nan(const Packet4Xf& a, const Packet4Xf& b) { + PacketMask8 mask = __riscv_vmfge_vv_f32m4_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f32m4(ptrue(a), 0.0f, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for float, so reinterpret casts +template <> +EIGEN_STRONG_INLINE Packet4Xf pand(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf por(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pxor(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vxor_vv_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vreinterpret_v_f32m4_u32m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pandnot(const Packet4Xf& a, const Packet4Xf& b) { + return __riscv_vreinterpret_v_u32m4_f32m4(__riscv_vand_vv_u32m4( + __riscv_vreinterpret_v_f32m4_u32m4(a), + __riscv_vnot_v_u32m4(__riscv_vreinterpret_v_f32m4_u32m4(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pload(const float* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf ploadu(const float* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle32_v_f32m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf ploaddup(const float* from) { + Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u32m4(__riscv_vand_vx_u32m4(idx, 0xfffffffeu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, 
unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf ploadquad(const float* from) { + Packet4Xu idx = __riscv_vid_v_u32m4(unpacket_traits::size); + idx = __riscv_vand_vx_u32m4(idx, 0xfffffffcu, unpacket_traits::size); + return __riscv_vloxei32_v_f32m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(float* to, const Packet4Xf& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4Xf& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse32_v_f32m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4Xf pgather(const float* from, Index stride) { + return __riscv_vlse32_v_f32m4(from, stride * sizeof(float), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(float* to, const Packet4Xf& from, Index stride) { + __riscv_vsse32(to, stride * sizeof(float), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE float pfirst(const Packet4Xf& a) { + return __riscv_vfmv_f_s_f32m4_f32(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf psqrt(const Packet4Xf& a) { + return __riscv_vfsqrt_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf print(const Packet4Xf& a) { + const Packet4Xf limit = pset1(static_cast(1 << 23)); + const Packet4Xf abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f32m4_b8(a, a, unpacket_traits::size); + const Packet4Xf x = __riscv_vfadd_vv_f32m4_tumu(mask, a, a, a, unpacket_traits::size); + const Packet4Xf new_x = __riscv_vfcvt_f_x_v_f32m4( + __riscv_vfcvt_x_f_v_i32m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f32m4_b8(abs_a, limit, unpacket_traits::size); + Packet4Xf signed_x = __riscv_vfsgnj_vv_f32m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f32m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pfloor(const Packet4Xf& a) { + Packet4Xf tmp = print(a); + // If greater, subtract one. 
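+  // print() rounds to nearest under the default rounding mode, which can
+  // overshoot: floor(a) = rint(a) - (rint(a) > a ? 1 : 0). The "tumu"
+  // subtract leaves lanes with rint(a) <= a untouched. Scalar model
+  // (illustrative sketch):
+  //   float flr(float a) { float t = rintf(a); return t > a ? t - 1.0f : t; }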
+ PacketMask8 mask = __riscv_vmflt_vv_f32m4_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f32m4_tumu(mask, tmp, tmp, 1.0f, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf preverse(const Packet4Xf& a) { + Packet4Xu idx = + __riscv_vrsub_vx_u32m4(__riscv_vid_v_u32m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f32m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pfrexp(const Packet4Xf& a, Packet4Xf& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE float predux(const Packet4Xf& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f32m4_f32m1( + a, __riscv_vfmv_v_f_f32m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet4Xf& a) { + Packet1Xf half1 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 0), __riscv_vget_v_f32m4_f32m1(a, 1), + unpacket_traits::size); + Packet1Xf half2 = __riscv_vfmul_vv_f32m1(__riscv_vget_v_f32m4_f32m1(a, 2), __riscv_vget_v_f32m4_f32m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f32m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet4Xf& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f32m4_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet4Xf& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f32m4_f32m1( + a, + __riscv_vfmv_v_f_f32m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + float buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse32(&buffer[i], N * sizeof(float), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle32_v_f32m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pldexp(const Packet4Xf& a, const Packet4Xf& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* Packet4Xl ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet4Xl pset1(const numext::int64_t& from) { + return __riscv_vmv_v_x_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl plset(const numext::int64_t& a) { + Packet4Xl idx = __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)); + return __riscv_vadd_vx_i64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pzero(const Packet4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl padd(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vadd_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl psub(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pnegate(const Packet4Xl& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pconj(const Packet4Xl& a) { + return a; +} + +template <> 
+EIGEN_STRONG_INLINE Packet4Xl pmul(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pdiv(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pmadd(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pmsub(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pnmadd(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pnmsub(const Packet4Xl& a, const Packet4Xl& b, const Packet4Xl& c) { + return __riscv_vnmsub_vv_i64m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pmin(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pmax(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pcmp_le(const Packet4Xl& a, const Packet4Xl& b) { + PacketMask16 mask = __riscv_vmsle_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pcmp_lt(const Packet4Xl& a, const Packet4Xl& b) { + PacketMask16 mask = __riscv_vmslt_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pcmp_eq(const Packet4Xl& a, const Packet4Xl& b) { + PacketMask16 mask = __riscv_vmseq_vv_i64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i64m4(pzero(a), 0xffffffffffffffff, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl ptrue(const Packet4Xl& /*a*/) { + return __riscv_vmv_v_x_i64m4(0xffffffffffffffffu, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pand(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vand_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl por(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pxor(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vxor_vv_i64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pandnot(const Packet4Xl& a, const Packet4Xl& b) { + return __riscv_vand_vv_i64m4(a, __riscv_vnot_v_i64m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet4Xl parithmetic_shift_right(Packet4Xl a) { + return __riscv_vsra_vx_i64m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet4Xl plogical_shift_right(Packet4Xl a) { + return __riscv_vreinterpret_i64m4( + __riscv_vsrl_vx_u64m4(__riscv_vreinterpret_u64m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet4Xl plogical_shift_left(Packet4Xl a) { + return __riscv_vsll_vx_i64m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pload(const numext::int64_t* from) { + 
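+  // RVV unit-stride loads impose no packet-level alignment requirement
+  // (only element alignment), so pload and ploadu both lower to vle64;
+  // EIGEN_DEBUG_ALIGNED_LOAD is an instrumentation hook that is empty
+  // unless redefined for debugging.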
EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl ploadu(const numext::int64_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_i64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl ploaddup(const numext::int64_t* from) { + Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + // idx = 0 0 sizeof(int64_t) sizeof(int64_t) 2*sizeof(int64_t) 2*sizeof(int64_t) ... + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl ploadquad(const numext::int64_t* from) { + Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_i64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int64_t* to, const Packet4Xl& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int64_t* to, const Packet4Xl& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_i64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4Xl pgather(const numext::int64_t* from, + Index stride) { + return __riscv_vlse64_v_i64m4(from, stride * sizeof(numext::int64_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int64_t* to, const Packet4Xl& from, + Index stride) { + __riscv_vsse64(to, stride * sizeof(numext::int64_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t pfirst(const Packet4Xl& a) { + return __riscv_vmv_x_s_i64m4_i64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl preverse(const Packet4Xl& a) { + Packet4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pabs(const Packet4Xl& a) { + Packet4Xl mask = __riscv_vsra_vx_i64m4(a, 63, unpacket_traits::size); + return __riscv_vsub_vv_i64m4(__riscv_vxor_vv_i64m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux(const Packet4Xl& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_mul(const Packet4Xl& a) { + Packet1Xl half1 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 0), __riscv_vget_v_i64m4_i64m1(a, 1), + unpacket_traits::size); + Packet1Xl half2 = __riscv_vmul_vv_i64m1(__riscv_vget_v_i64m4_i64m1(a, 2), __riscv_vget_v_i64m4_i64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_min(const Packet4Xl& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int64_t predux_max(const Packet4Xl& a) { + return 
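+  // vredmax folds every m4 lane into element 0 of an m1 destination seeded
+  // with the type minimum, the identity for max, and vmv_x extracts that
+  // scalar. Equivalent loop (illustrative sketch):
+  //   int64_t r = std::numeric_limits<int64_t>::min();
+  //   for (int i = 0; i < lanes; ++i) r = std::max(r, a[i]);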
__riscv_vmv_x(__riscv_vredmax_vs_i64m4_i64m1( + a, __riscv_vmv_v_x_i64m1((std::numeric_limits::min)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int64_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(numext::int64_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_i64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +/********************************* Packet4Xd ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet4Xd ptrue(const Packet4Xd& /*a*/) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(0xffffffffffffffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pzero(const Packet4Xd& /*a*/) { + return __riscv_vfmv_v_f_f64m4(0.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pabs(const Packet4Xd& a) { + return __riscv_vfabs_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pset1(const double& from) { + return __riscv_vfmv_v_f_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pset1frombits(numext::uint64_t from) { + return __riscv_vreinterpret_f64m4(__riscv_vmv_v_x_u64m4(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd plset(const double& a) { + Packet4Xd idx = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vreinterpret_v_u64m4_i64m4(__riscv_vid_v_u64m4(unpacket_traits::size)), + unpacket_traits::size); + return __riscv_vfadd_vf_f64m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd padd(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfadd_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd psub(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfsub_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pnegate(const Packet4Xd& a) { + return __riscv_vfneg_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pconj(const Packet4Xd& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmul(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfmul_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pdiv(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfdiv_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmadd(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) { + return __riscv_vfmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmsub(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) { + return __riscv_vfmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pnmadd(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) { + return __riscv_vfnmsub_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pnmsub(const Packet4Xd& a, const Packet4Xd& b, const Packet4Xd& c) { + return __riscv_vfnmadd_vv_f64m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmin(const Packet4Xd& a, const Packet4Xd& b) { + Packet4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = 
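+      // a == a is false exactly on NaN lanes, so ANDing the two
+      // self-comparisons below marks the lanes where both inputs are
+      // ordinary numbers; only those lanes receive vfmin's result.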
__riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmin(const Packet4Xd& a, const Packet4Xd& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmin(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfmin_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmax(const Packet4Xd& a, const Packet4Xd& b) { + Packet4Xd nans = + __riscv_vfmv_v_f_f64m4((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f64m4_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f64m4_tumu(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmax(const Packet4Xd& a, const Packet4Xd& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pmax(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vfmax_vv_f64m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcmp_le(const Packet4Xd& a, const Packet4Xd& b) { + PacketMask16 mask = __riscv_vmfle_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcmp_lt(const Packet4Xd& a, const Packet4Xd& b) { + PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcmp_eq(const Packet4Xd& a, const Packet4Xd& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcmp_lt_or_nan(const Packet4Xd& a, const Packet4Xd& b) { + PacketMask16 mask = __riscv_vmfge_vv_f64m4_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f64m4(ptrue(a), 0.0, mask, unpacket_traits::size); +} + +// Logical Operations are not supported for double, so reinterpret casts +template <> +EIGEN_STRONG_INLINE Packet4Xd pand(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd por(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pxor(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vxor_vv_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vreinterpret_v_f64m4_u64m4(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pandnot(const Packet4Xd& a, const Packet4Xd& b) { + return __riscv_vreinterpret_v_u64m4_f64m4(__riscv_vand_vv_u64m4( + __riscv_vreinterpret_v_f64m4_u64m4(a), + __riscv_vnot_v_u64m4(__riscv_vreinterpret_v_f64m4_u64m4(b), 
unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pload(const double* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd ploadu(const double* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle64_v_f64m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd ploaddup(const double* from) { + Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffeu, unpacket_traits::size), 2, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd ploadquad(const double* from) { + Packet4Xul idx = __riscv_vid_v_u64m4(unpacket_traits::size); + idx = __riscv_vsll_vx_u64m4(__riscv_vand_vx_u64m4(idx, 0xfffffffffffffffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei64_v_f64m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(double* to, const Packet4Xd& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(double* to, const Packet4Xd& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse64_v_f64m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4Xd pgather(const double* from, Index stride) { + return __riscv_vlse64_v_f64m4(from, stride * sizeof(double), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet4Xd& from, Index stride) { + __riscv_vsse64(to, stride * sizeof(double), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE double pfirst(const Packet4Xd& a) { + return __riscv_vfmv_f_s_f64m4_f64(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd psqrt(const Packet4Xd& a) { + return __riscv_vfsqrt_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd print(const Packet4Xd& a) { + const Packet4Xd limit = pset1(static_cast(1ull << 52)); + const Packet4Xd abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f64m4_b16(a, a, unpacket_traits::size); + const Packet4Xd x = __riscv_vfadd_vv_f64m4_tumu(mask, a, a, a, unpacket_traits::size); + const Packet4Xd new_x = __riscv_vfcvt_f_x_v_f64m4( + __riscv_vfcvt_x_f_v_i64m4(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f64m4_b16(abs_a, limit, unpacket_traits::size); + Packet4Xd signed_x = __riscv_vfsgnj_vv_f64m4(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f64m4(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pfloor(const Packet4Xd& a) { + Packet4Xd tmp = print(a); + // If greater, subtract one. 
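+  // Same scheme as the f32 path: the "tumu" subtract is a no-op wherever
+  // the mask is clear, so only lanes with rint(a) > a step down by 1.0.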
+ PacketMask16 mask = __riscv_vmflt_vv_f64m4_b16(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f64m4_tumu(mask, tmp, tmp, 1.0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd preverse(const Packet4Xd& a) { + Packet4Xul idx = + __riscv_vrsub_vx_u64m4(__riscv_vid_v_u64m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f64m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pfrexp(const Packet4Xd& a, Packet4Xd& exponent) { + return pfrexp_generic(a, exponent); +} + +template <> +EIGEN_STRONG_INLINE double predux(const Packet4Xd& a) { + return __riscv_vfmv_f(__riscv_vfredusum_vs_f64m4_f64m1( + a, __riscv_vfmv_v_f_f64m1(0.0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet4Xd& a) { + Packet1Xd half1 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 0), __riscv_vget_v_f64m4_f64m1(a, 1), + unpacket_traits::size); + Packet1Xd half2 = __riscv_vfmul_vv_f64m1(__riscv_vget_v_f64m4_f64m1(a, 2), __riscv_vget_v_f64m4_f64m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vfmul_vv_f64m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet4Xd& a) { + return (std::min)(__riscv_vfmv_f(__riscv_vfredmin_vs_f64m4_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + (std::numeric_limits::max)()); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet4Xd& a) { + return (std::max)(__riscv_vfmv_f(__riscv_vfredmax_vs_f64m4_f64m1( + a, + __riscv_vfmv_v_f_f64m1((std::numeric_limits::quiet_NaN)(), + unpacket_traits::size / 4), + unpacket_traits::size)), + -(std::numeric_limits::max)()); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + double buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse64(&buffer[i], N * sizeof(double), kernel.packet[i], unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle64_v_f64m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pldexp(const Packet4Xd& a, const Packet4Xd& exponent) { + return pldexp_generic(a, exponent); +} + +/********************************* Packet4Xs ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet4Xs pset1(const numext::int16_t& from) { + return __riscv_vmv_v_x_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs plset(const numext::int16_t& a) { + Packet4Xs idx = __riscv_vreinterpret_v_u16m4_i16m4(__riscv_vid_v_u16m4(unpacket_traits::size)); + return __riscv_vadd_vx_i16m4(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pzero(const Packet4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(0, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs padd(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vadd_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs psub(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vsub(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pnegate(const Packet4Xs& a) { + return __riscv_vneg(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pconj(const Packet4Xs& a) { + return a; +} + +template <> 
+EIGEN_STRONG_INLINE Packet4Xs pmul(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vmul(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pdiv(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vdiv(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pmadd(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) { + return __riscv_vmadd(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pmsub(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) { + return __riscv_vmadd(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pnmadd(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pnmsub(const Packet4Xs& a, const Packet4Xs& b, const Packet4Xs& c) { + return __riscv_vnmsub_vv_i16m4(a, b, pnegate(c), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pmin(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vmin(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pmax(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vmax(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pcmp_le(const Packet4Xs& a, const Packet4Xs& b) { + PacketMask4 mask = __riscv_vmsle_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pcmp_lt(const Packet4Xs& a, const Packet4Xs& b) { + PacketMask4 mask = __riscv_vmslt_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pcmp_eq(const Packet4Xs& a, const Packet4Xs& b) { + PacketMask4 mask = __riscv_vmseq_vv_i16m4_b4(a, b, unpacket_traits::size); + return __riscv_vmerge_vxm_i16m4(pzero(a), static_cast(0xffff), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs ptrue(const Packet4Xs& /*a*/) { + return __riscv_vmv_v_x_i16m4(static_cast(0xffffu), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pand(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vand_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs por(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pxor(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vxor_vv_i16m4(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pandnot(const Packet4Xs& a, const Packet4Xs& b) { + return __riscv_vand_vv_i16m4(a, __riscv_vnot_v_i16m4(b, unpacket_traits::size), + unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet4Xs parithmetic_shift_right(Packet4Xs a) { + return __riscv_vsra_vx_i16m4(a, N, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE Packet4Xs plogical_shift_right(Packet4Xs a) { + return __riscv_vreinterpret_i16m4( + __riscv_vsrl_vx_u16m4(__riscv_vreinterpret_u16m4(a), N, unpacket_traits::size)); +} + +template +EIGEN_STRONG_INLINE Packet4Xs plogical_shift_left(Packet4Xs a) { + return __riscv_vsll_vx_i16m4(a, N, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pload(const numext::int16_t* from) { + 
EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs ploadu(const numext::int16_t* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_i16m4(from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs ploaddup(const numext::int16_t* from) { + Packet4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vand_vx_u16m4(idx, 0xfffeu, unpacket_traits::size); + // idx = 0 0 sizeof(int16_t) sizeof(int16_t) 2*sizeof(int16_t) 2*sizeof(int16_t) ... + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs ploadquad(const numext::int16_t* from) { + Packet4Xsu idx = __riscv_vid_v_u16m4(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m4(__riscv_vand_vx_u16m4(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_i16m4(from, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(numext::int16_t* to, const Packet4Xs& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(numext::int16_t* to, const Packet4Xs& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_i16m4(to, from, unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet4Xs pgather(const numext::int16_t* from, + Index stride) { + return __riscv_vlse16_v_i16m4(from, stride * sizeof(numext::int16_t), unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(numext::int16_t* to, const Packet4Xs& from, + Index stride) { + __riscv_vsse16(to, stride * sizeof(numext::int16_t), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t pfirst(const Packet4Xs& a) { + return __riscv_vmv_x_s_i16m4_i16(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs preverse(const Packet4Xs& a) { + Packet4Xsu idx = + __riscv_vrsub_vx_u16m4(__riscv_vid_v_u16m4(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_i16m4(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pabs(const Packet4Xs& a) { + Packet4Xs mask = __riscv_vsra_vx_i16m4(a, 15, unpacket_traits::size); + return __riscv_vsub_vv_i16m4(__riscv_vxor_vv_i16m4(a, mask, unpacket_traits::size), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux(const Packet4Xs& a) { + return __riscv_vmv_x(__riscv_vredsum_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1(0, unpacket_traits::size / 4), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_mul(const Packet4Xs& a) { + Packet1Xs half1 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 0), __riscv_vget_v_i16m4_i16m1(a, 1), + unpacket_traits::size); + Packet1Xs half2 = __riscv_vmul_vv_i16m1(__riscv_vget_v_i16m4_i16m1(a, 2), __riscv_vget_v_i16m4_i16m1(a, 3), + unpacket_traits::size); + return predux_mul(__riscv_vmul_vv_i16m1(half1, half2, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_min(const Packet4Xs& a) { + return __riscv_vmv_x(__riscv_vredmin_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE numext::int16_t predux_max(const Packet4Xs& a) { + return __riscv_vmv_x(__riscv_vredmax_vs_i16m4_i16m1( + a, __riscv_vmv_v_x_i16m1((std::numeric_limits::min)(), 
unpacket_traits::size / 4), + unpacket_traits::size)); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + numext::int16_t buffer[unpacket_traits::size * N] = {0}; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(&buffer[i], N * sizeof(numext::int16_t), kernel.packet[i], unpacket_traits::size); + } + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_i16m4(&buffer[i * unpacket_traits::size], unpacket_traits::size); + } +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_PACKET4_MATH_RVV10_H diff --git a/Eigen/src/Core/arch/RVV10/PacketMathFP16.h b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h new file mode 100644 index 0000000000000000000000000000000000000000..fbda1913813617db87f6be6f1cd4bc01ce58b608 --- /dev/null +++ b/Eigen/src/Core/arch/RVV10/PacketMathFP16.h @@ -0,0 +1,922 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2025 Kseniya Zaytseva +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_PACKET_MATH_FP16_RVV10_H +#define EIGEN_PACKET_MATH_FP16_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +typedef vfloat16m1_t Packet1Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL))); +typedef vfloat16m2_t Packet2Xh __attribute__((riscv_rvv_vector_bits(EIGEN_RISCV64_RVV_VL * 2))); + +#if EIGEN_RISCV64_DEFAULT_LMUL == 1 +typedef Packet1Xh PacketXh; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet1Xh type; + typedef Packet1Xh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; + +#else +typedef Packet2Xh PacketXh; + +template <> +struct packet_traits : default_packet_traits { + typedef Packet2Xh type; + typedef Packet1Xh half; + + enum { + Vectorizable = 1, + AlignedOnScalar = 1, + size = rvv_packet_size_selector::size, + + HasAdd = 1, + HasSub = 1, + HasShift = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 1, + HasMin = 1, + HasMax = 1, + HasConj = 1, + HasSetLinear = 0, + HasBlend = 0, + HasReduxp = 0, + + HasCmp = 1, + HasDiv = 1, + HasRound = 1, + + HasSin = EIGEN_FAST_MATH, + HasCos = EIGEN_FAST_MATH, + HasLog = 0, + HasExp = 0, + HasSqrt = 1, + HasTanh = EIGEN_FAST_MATH, + HasErf = 0 + }; +}; +#endif + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet1Xh half; // Half not yet implemented + typedef PacketXs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +template <> +struct unpacket_traits { + typedef Eigen::half type; + typedef Packet1Xh half; + typedef Packet2Xs integer_packet; + typedef numext::uint8_t mask_t; + + enum { + size = 
rvv_packet_size_selector::size, + alignment = rvv_packet_alignment_selector::alignment, + vectorizable = true, + masked_load_available = false, + masked_store_available = false + }; +}; + +/********************************* PacketXh ************************************/ + +template <> +EIGEN_STRONG_INLINE PacketXh ptrue(const PacketXh& /*a*/) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pzero(const PacketXh& /*a*/) { + return __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pabs(const PacketXh& a) { + return __riscv_vfabs_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m1(__riscv_vmv_v_x_u16m1(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh plset(const Eigen::half& a) { + PacketXh idx = + __riscv_vfcvt_f_x_v_f16m1(__riscv_vid_v_i16m1(unpacket_traits::size), unpacket_traits::size); + return __riscv_vfadd_vf_f16m1(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh padd(const PacketXh& a, const PacketXh& b) { + return __riscv_vfadd_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh psub(const PacketXh& a, const PacketXh& b) { + return __riscv_vfsub_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnegate(const PacketXh& a) { + return __riscv_vfneg_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pconj(const PacketXh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmul(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmul_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pdiv(const PacketXh& a, const PacketXh& b) { + return __riscv_vfdiv_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmadd(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmsub_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pnmsub(const PacketXh& a, const PacketXh& b, const PacketXh& c) { + return __riscv_vfnmadd_vv_f16m1(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmin(const PacketXh& a, const PacketXh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh 
pmin(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmin_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + PacketXh nans = + __riscv_vfmv_v_f_f16m1((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, a, unpacket_traits::size); + PacketMask16 mask2 = __riscv_vmfeq_vv_f16m1_b16(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b16(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m1_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pmax(const PacketXh& a, const PacketXh& b) { + return __riscv_vfmax_vv_f16m1(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_le(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfle_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_eq(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfeq_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(pzero(a), ptrue(a), mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pcmp_lt_or_nan(const PacketXh& a, const PacketXh& b) { + PacketMask16 mask = __riscv_vmfge_vv_f16m1_b16(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m1(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE PacketXh pand(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh por(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pxor(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vxor_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), __riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pandnot(const PacketXh& a, const PacketXh& b) { + return __riscv_vreinterpret_v_u16m1_f16m1(__riscv_vand_vv_u16m1( + __riscv_vreinterpret_v_f16m1_u16m1(a), + __riscv_vnot_v_u16m1(__riscv_vreinterpret_v_f16m1_u16m1(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m1(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploaddup(const 
Eigen::half* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vand_vx_u16m1(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh ploadquad(const Eigen::half* from) { + PacketXsu idx = __riscv_vid_v_u16m1(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m1(__riscv_vand_vx_u16m1(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_f16m1(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const PacketXh& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const PacketXh& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m1(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline PacketXh pgather(const Eigen::half* from, Index stride) { + return __riscv_vlse16_v_f16m1(reinterpret_cast(from), stride * sizeof(Eigen::half), + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const PacketXh& from, Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const PacketXh& a) { + return static_cast(__riscv_vfmv_f_s_f16m1_f16(a)); +} + +template <> +EIGEN_STRONG_INLINE PacketXh psqrt(const PacketXh& a) { + return __riscv_vfsqrt_v_f16m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh print(const PacketXh& a) { + const PacketXh limit = pset1(static_cast(1 << 10)); + const PacketXh abs_a = pabs(a); + + PacketMask16 mask = __riscv_vmfne_vv_f16m1_b16(a, a, unpacket_traits::size); + const PacketXh x = __riscv_vfadd_vv_f16m1_tum(mask, a, a, a, unpacket_traits::size); + const PacketXh new_x = __riscv_vfcvt_f_x_v_f16m1(__riscv_vfcvt_x_f_v_i16m1(a, unpacket_traits::size), + unpacket_traits::size); + + mask = __riscv_vmflt_vv_f16m1_b16(abs_a, limit, unpacket_traits::size); + PacketXh signed_x = __riscv_vfsgnj_vv_f16m1(new_x, x, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m1(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE PacketXh pfloor(const PacketXh& a) { + PacketXh tmp = print(a); + // If greater, subtract one. 
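+  // print() rounds to nearest (default RNE mode), so lanes where the rounded
+  // value tmp overshoots a (i.e. a < tmp) must be pulled back down by one;
+  // the masked, tail-undisturbed (_tum) subtract below touches only those
+  // lanes and leaves the rest of tmp unchanged.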
+  PacketMask16 mask = __riscv_vmflt_vv_f16m1_b16(a, tmp, unpacket_traits<PacketXh>::size);
+  return __riscv_vfsub_vf_f16m1_tum(mask, tmp, tmp, static_cast<_Float16>(1.0), unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh preverse(const PacketXh& a) {
+  PacketXsu idx = __riscv_vrsub_vx_u16m1(__riscv_vid_v_u16m1(unpacket_traits<PacketXh>::size),
+                                         unpacket_traits<PacketXh>::size - 1, unpacket_traits<PacketXh>::size);
+  return __riscv_vrgather_vv_f16m1(a, idx, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<PacketXh>(const PacketXh& a) {
+  return static_cast<Eigen::half>(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m1_f16m1(
+      a, __riscv_vfmv_v_f_f16m1(static_cast<_Float16>(0.0), unpacket_traits<PacketXh>::size),
+      unpacket_traits<PacketXh>::size)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<PacketXh>(const PacketXh& a) {
+  // Multiply the vector by its reverse
+  PacketXh prod = __riscv_vfmul_vv_f16m1(preverse(a), a, unpacket_traits<PacketXh>::size);
+  PacketXh half_prod;
+
+  if (EIGEN_RISCV64_RVV_VL >= 1024) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 16, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 512) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 8, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  if (EIGEN_RISCV64_RVV_VL >= 256) {
+    half_prod = __riscv_vslidedown_vx_f16m1(prod, 4, unpacket_traits<PacketXh>::size);
+    prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+  }
+  // Last reduction
+  half_prod = __riscv_vslidedown_vx_f16m1(prod, 2, unpacket_traits<PacketXh>::size);
+  prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+
+  half_prod = __riscv_vslidedown_vx_f16m1(prod, 1, unpacket_traits<PacketXh>::size);
+  prod = __riscv_vfmul_vv_f16m1(prod, half_prod, unpacket_traits<PacketXh>::size);
+
+  // The reduction is done to the first element.
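+  // (a * reverse(a)) pairs lane i with lane n-1-i, so only the first n/2
+  // lanes still carry information; each slidedown then halves that live
+  // prefix until the product of all elements ends up in lane 0.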
+ return pfirst(prod); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const PacketXh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m1_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = __riscv_vle16_v_f16m1(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE Packet2Xf half2float(const PacketXh& a) { + return __riscv_vfwcvt_f_f_v_f32m2(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE PacketXh float2half(const Packet2Xf& a) { + return __riscv_vfncvt_f_f_w_f16m1(a, unpacket_traits::size); +} + +/********************************* Packet2Xh ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xh ptrue(const Packet2Xh& /*a*/) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(0xffffu, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pzero(const Packet2Xh& /*a*/) { + return __riscv_vfmv_v_f_f16m2(static_cast(0.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pabs(const Packet2Xh& a) { + return __riscv_vfabs_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pset1(const Eigen::half& from) { + return __riscv_vfmv_v_f_f16m2(static_cast<_Float16>(from), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pset1frombits(numext::uint16_t from) { + return __riscv_vreinterpret_f16m2(__riscv_vmv_v_x_u16m2(from, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh plset(const Eigen::half& a) { + Packet2Xh idx = __riscv_vfcvt_f_x_v_f16m2(__riscv_vid_v_i16m2(unpacket_traits::size), + unpacket_traits::size); + return __riscv_vfadd_vf_f16m2(idx, a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh padd(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfadd_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh psub(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfsub_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pnegate(const Packet2Xh& a) { + return __riscv_vfneg_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pconj(const Packet2Xh& a) { + return a; +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmul(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfmul_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pdiv(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfdiv_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmadd(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) { + return __riscv_vfmadd_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmsub(const Packet2Xh& a, const 
Packet2Xh& b, const Packet2Xh& c) { + return __riscv_vfmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pnmadd(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) { + return __riscv_vfnmsub_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pnmsub(const Packet2Xh& a, const Packet2Xh& b, const Packet2Xh& c) { + return __riscv_vfnmadd_vv_f16m2(a, b, c, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmin(const Packet2Xh& a, const Packet2Xh& b) { + Packet2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmin_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmin(const Packet2Xh& a, const Packet2Xh& b) { + return pmin(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmin(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfmin_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmax(const Packet2Xh& a, const Packet2Xh& b) { + Packet2Xh nans = + __riscv_vfmv_v_f_f16m2((std::numeric_limits::quiet_NaN)(), unpacket_traits::size); + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, a, unpacket_traits::size); + PacketMask8 mask2 = __riscv_vmfeq_vv_f16m2_b8(b, b, unpacket_traits::size); + mask = __riscv_vmand_mm_b8(mask, mask2, unpacket_traits::size); + + return __riscv_vfmax_vv_f16m2_tum(mask, nans, a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmax(const Packet2Xh& a, const Packet2Xh& b) { + return pmax(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pmax(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vfmax_vv_f16m2(a, b, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pcmp_le(const Packet2Xh& a, const Packet2Xh& b) { + PacketMask8 mask = __riscv_vmfle_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pcmp_lt(const Packet2Xh& a, const Packet2Xh& b) { + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pcmp_eq(const Packet2Xh& a, const Packet2Xh& b) { + PacketMask8 mask = __riscv_vmfeq_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vmerge_vvm_f16m2(pzero(a), ptrue(a), mask, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pcmp_lt_or_nan(const Packet2Xh& a, const Packet2Xh& b) { + PacketMask8 mask = __riscv_vmfge_vv_f16m2_b8(a, b, unpacket_traits::size); + return __riscv_vfmerge_vfm_f16m2(ptrue(a), static_cast(0.0), mask, + unpacket_traits::size); +} + +// Logical Operations are not supported for half, so reinterpret casts +template <> +EIGEN_STRONG_INLINE Packet2Xh pand(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh por(const Packet2Xh& a, const Packet2Xh& b) { + return 
__riscv_vreinterpret_v_u16m2_f16m2(__riscv_vor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pxor(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vxor_vv_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vreinterpret_v_f16m2_u16m2(b), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pandnot(const Packet2Xh& a, const Packet2Xh& b) { + return __riscv_vreinterpret_v_u16m2_f16m2(__riscv_vand_vv_u16m2( + __riscv_vreinterpret_v_f16m2_u16m2(a), + __riscv_vnot_v_u16m2(__riscv_vreinterpret_v_f16m2_u16m2(b), unpacket_traits::size), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pload(const Eigen::half* from) { + EIGEN_DEBUG_ALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh ploadu(const Eigen::half* from) { + EIGEN_DEBUG_UNALIGNED_LOAD return __riscv_vle16_v_f16m2(reinterpret_cast(from), + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh ploaddup(const Eigen::half* from) { + Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vand_vx_u16m2(idx, 0xfffeu, unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh ploadquad(const Eigen::half* from) { + Packet2Xsu idx = __riscv_vid_v_u16m2(unpacket_traits::size); + idx = __riscv_vsrl_vx_u16m2(__riscv_vand_vx_u16m2(idx, 0xfffcu, unpacket_traits::size), 1, + unpacket_traits::size); + return __riscv_vloxei16_v_f16m2(reinterpret_cast(from), idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const Packet2Xh& from) { + EIGEN_DEBUG_ALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const Packet2Xh& from) { + EIGEN_DEBUG_UNALIGNED_STORE __riscv_vse16_v_f16m2(reinterpret_cast<_Float16*>(to), from, + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline Packet2Xh pgather(const Eigen::half* from, Index stride) { + return __riscv_vlse16_v_f16m2(reinterpret_cast(from), stride * sizeof(Eigen::half), + unpacket_traits::size); +} + +template <> +EIGEN_DEVICE_FUNC inline void pscatter(Eigen::half* to, const Packet2Xh& from, + Index stride) { + __riscv_vsse16(reinterpret_cast<_Float16*>(to), stride * sizeof(Eigen::half), from, + unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half pfirst(const Packet2Xh& a) { + return static_cast(__riscv_vfmv_f_s_f16m2_f16(a)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh psqrt(const Packet2Xh& a) { + return __riscv_vfsqrt_v_f16m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh print(const Packet2Xh& a) { + const Packet2Xh limit = pset1(static_cast(1 << 10)); + const Packet2Xh abs_a = pabs(a); + + PacketMask8 mask = __riscv_vmfne_vv_f16m2_b8(a, a, unpacket_traits::size); + const Packet2Xh x = __riscv_vfadd_vv_f16m2_tum(mask, a, a, a, unpacket_traits::size); + const Packet2Xh new_x = __riscv_vfcvt_f_x_v_f16m2( + __riscv_vfcvt_x_f_v_i16m2(a, unpacket_traits::size), unpacket_traits::size); + + mask = __riscv_vmflt_vv_f16m2_b8(abs_a, limit, unpacket_traits::size); + Packet2Xh signed_x = __riscv_vfsgnj_vv_f16m2(new_x, x, unpacket_traits::size); + return 
__riscv_vmerge_vvm_f16m2(x, signed_x, mask, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh pfloor(const Packet2Xh& a) { + Packet2Xh tmp = print(a); + // If greater, subtract one. + PacketMask8 mask = __riscv_vmflt_vv_f16m2_b8(a, tmp, unpacket_traits::size); + return __riscv_vfsub_vf_f16m2_tum(mask, tmp, tmp, static_cast(1.0), unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xh preverse(const Packet2Xh& a) { + Packet2Xsu idx = + __riscv_vrsub_vx_u16m2(__riscv_vid_v_u16m2(unpacket_traits::size), + unpacket_traits::size - 1, unpacket_traits::size); + return __riscv_vrgather_vv_f16m2(a, idx, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux(const Packet2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredusum_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(static_cast(0.0), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_mul(const Packet2Xh& a) { + return predux_mul(__riscv_vfmul_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_min(const Packet2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmin_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1((std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template <> +EIGEN_STRONG_INLINE Eigen::half predux_max(const Packet2Xh& a) { + return static_cast(__riscv_vfmv_f(__riscv_vfredmax_vs_f16m2_f16m1( + a, __riscv_vfmv_v_f_f16m1(-(std::numeric_limits::max)(), unpacket_traits::size / 4), + unpacket_traits::size))); +} + +template +EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { + Eigen::half buffer[unpacket_traits::size * N]; + int i = 0; + + for (i = 0; i < N; i++) { + __riscv_vsse16(reinterpret_cast<_Float16*>(&buffer[i]), N * sizeof(Eigen::half), kernel.packet[i], + unpacket_traits::size); + } + + for (i = 0; i < N; i++) { + kernel.packet[i] = + __riscv_vle16_v_f16m2(reinterpret_cast<_Float16*>(&buffer[i * unpacket_traits::size]), + unpacket_traits::size); + } +} + +EIGEN_STRONG_INLINE Packet4Xf half2float(const Packet2Xh& a) { + return __riscv_vfwcvt_f_f_v_f32m4(a, unpacket_traits::size); +} + +EIGEN_STRONG_INLINE Packet2Xh float2half(const Packet4Xf& a) { + return __riscv_vfncvt_f_f_w_f16m2(a, unpacket_traits::size); +} + +template +EIGEN_STRONG_INLINE +typename std::enable_if::value && (unpacket_traits::size % 8) == 0, + PacketXh>::type +predux_half_dowto4(const Packet2Xh& a) { + return __riscv_vfadd_vv_f16m1(__riscv_vget_v_f16m2_f16m1(a, 0), __riscv_vget_v_f16m2_f16m1(a, 1), + unpacket_traits::size); +} + +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, pcos) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, pexp) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, pexpm1) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, plog) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, plog1p) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, plog2) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, preciprocal) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, prsqrt) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, psin) +F16_PACKET_FUNCTION(Packet2Xf, PacketXh, ptanh) + +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pcos) +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pexp) +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, pexpm1) +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog) +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog1p) +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, plog2) +F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, 
preciprocal)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, prsqrt)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, psin)
+F16_PACKET_FUNCTION(Packet4Xf, Packet2Xh, ptanh)
+
+/********************************* casting ************************************/
+
+template <>
+struct type_casting_traits<_Float16, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int16_t, _Float16> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXh pcast<PacketXs, PacketXh>(const PacketXs& a) {
+  return __riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXs pcast<PacketXh, PacketXs>(const PacketXh& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXh preinterpret<PacketXh, PacketXs>(const PacketXs& a) {
+  return __riscv_vreinterpret_v_i16m1_f16m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXs preinterpret<PacketXs, PacketXh>(const PacketXh& a) {
+  return __riscv_vreinterpret_v_f16m1_i16m1(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcast<Packet2Xs, Packet2Xh>(const Packet2Xs& a) {
+  return __riscv_vfcvt_f_x_v_f16m2(a, unpacket_traits<Packet2Xh>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xs pcast<Packet2Xh, Packet2Xs>(const Packet2Xh& a) {
+  return __riscv_vfcvt_rtz_x_f_v_i16m2(a, unpacket_traits<Packet2Xs>::size);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xh preinterpret<Packet2Xh, Packet2Xs>(const Packet2Xs& a) {
+  return __riscv_vreinterpret_v_i16m2_f16m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xs preinterpret<Packet2Xs, Packet2Xh>(const Packet2Xh& a) {
+  return __riscv_vreinterpret_v_f16m2_i16m2(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4Xs pcast<PacketXh, Packet4Xs>(const PacketXh& a, const PacketXh& b, const PacketXh& c,
+                                                         const PacketXh& d) {
+  return __riscv_vcreate_v_i16m1_i16m4(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(c, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(d, unpacket_traits<PacketXh>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcast<PacketXs, Packet2Xh>(const PacketXs& a, const PacketXs& b) {
+  return __riscv_vcreate_v_f16m1_f16m2(__riscv_vfcvt_f_x_v_f16m1(a, unpacket_traits<PacketXs>::size),
+                                       __riscv_vfcvt_f_x_v_f16m1(b, unpacket_traits<PacketXs>::size));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xh pcast<PacketXh, Packet2Xh>(const PacketXh& a, const PacketXh& b) {
+  return __riscv_vcreate_v_f16m1_f16m2(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2Xs pcast<PacketXh, Packet2Xs>(const PacketXh& a, const PacketXh& b) {
+  return __riscv_vcreate_v_i16m1_i16m2(__riscv_vfcvt_rtz_x_f_v_i16m1(a, unpacket_traits<PacketXh>::size),
+                                       __riscv_vfcvt_rtz_x_f_v_i16m1(b, unpacket_traits<PacketXh>::size));
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_FP16_RVV10_H
diff --git a/Eigen/src/Core/arch/RVV10/TypeCasting.h b/Eigen/src/Core/arch/RVV10/TypeCasting.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b0d3db47b6c55b7d39e2d3d816ff29fcdc7dd1f
--- /dev/null
+++ b/Eigen/src/Core/arch/RVV10/TypeCasting.h
@@ -0,0 +1,284 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Kseniya Zaytseva
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
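+
+// Vectorized conversions between the RVV10 packet types: value casts (pcast)
+// between int32/float and int64/double at LMUL 1, 2 and 4, bit-pattern casts
+// (preinterpret), and LMUL-widening concatenations built from __riscv_vcreate.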
+ +#ifndef EIGEN_TYPE_CASTING_RVV10_H +#define EIGEN_TYPE_CASTING_RVV10_H + +// IWYU pragma: private +#include "../../InternalHeaderCheck.h" + +namespace Eigen { +namespace internal { + +/********************************* 32 bits ************************************/ + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet1Xf pcast(const Packet1Xi& a) { + return __riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xi pcast(const Packet1Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xf preinterpret(const Packet1Xi& a) { + return __riscv_vreinterpret_v_i32m1_f32m1(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xi preinterpret(const Packet1Xf& a) { + return __riscv_vreinterpret_v_f32m1_i32m1(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcast(const Packet4Xi& a) { + return __riscv_vfcvt_f_x_v_f32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pcast(const Packet4Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf preinterpret(const Packet4Xi& a) { + return __riscv_vreinterpret_v_i32m4_f32m4(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi preinterpret(const Packet4Xf& a) { + return __riscv_vreinterpret_v_f32m4_i32m4(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcast(const Packet2Xi& a) { + return __riscv_vfcvt_f_x_v_f32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pcast(const Packet2Xf& a) { + return __riscv_vfcvt_rtz_x_f_v_i32m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf preinterpret(const Packet2Xi& a) { + return __riscv_vreinterpret_v_i32m2_f32m2(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi preinterpret(const Packet2Xf& a) { + return __riscv_vreinterpret_v_f32m2_i32m2(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pcast(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c, + const Packet1Xi& d) { + return __riscv_vcreate_v_i32m1_i32m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcast(const Packet1Xi& a, const Packet1Xi& b, const Packet1Xi& c, + const Packet1Xi& d) { + return __riscv_vcreate_v_f32m1_f32m4(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xf pcast(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c, + const Packet1Xf& d) { + return __riscv_vcreate_v_f32m1_f32m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xi pcast(const Packet1Xf& a, const Packet1Xf& b, const Packet1Xf& c, + const Packet1Xf& d) { + return __riscv_vcreate_v_i32m1_i32m4(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pcast(const Packet1Xi& a, const Packet1Xi& b) { + return __riscv_vcreate_v_i32m1_i32m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcast(const Packet1Xi& a, const 
Packet1Xi& b) { + return __riscv_vcreate_v_f32m1_f32m2(__riscv_vfcvt_f_x_v_f32m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f32m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xf pcast(const Packet1Xf& a, const Packet1Xf& b) { + return __riscv_vcreate_v_f32m1_f32m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xi pcast(const Packet1Xf& a, const Packet1Xf& b) { + return __riscv_vcreate_v_i32m1_i32m2(__riscv_vfcvt_rtz_x_f_v_i32m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i32m1(b, unpacket_traits::size)); +} + +/********************************* 64 bits ************************************/ + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +struct type_casting_traits { + enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 }; +}; + +template <> +EIGEN_STRONG_INLINE Packet1Xd pcast(const Packet1Xl& a) { + return __riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xl pcast(const Packet1Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xd preinterpret(const Packet1Xl& a) { + return __riscv_vreinterpret_v_i64m1_f64m1(a); +} + +template <> +EIGEN_STRONG_INLINE Packet1Xl preinterpret(const Packet1Xd& a) { + return __riscv_vreinterpret_v_f64m1_i64m1(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcast(const Packet4Xl& a) { + return __riscv_vfcvt_f_x_v_f64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pcast(const Packet4Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m4(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd preinterpret(const Packet4Xl& a) { + return __riscv_vreinterpret_v_i64m4_f64m4(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl preinterpret(const Packet4Xd& a) { + return __riscv_vreinterpret_v_f64m4_i64m4(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcast(const Packet2Xl& a) { + return __riscv_vfcvt_f_x_v_f64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pcast(const Packet2Xd& a) { + return __riscv_vfcvt_rtz_x_f_v_i64m2(a, unpacket_traits::size); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd preinterpret(const Packet2Xl& a) { + return __riscv_vreinterpret_v_i64m2_f64m2(a); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl preinterpret(const Packet2Xd& a) { + return __riscv_vreinterpret_v_f64m2_i64m2(a); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pcast(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c, + const Packet1Xl& d) { + return __riscv_vcreate_v_i64m1_i64m4(a, b, c, d); + ; +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcast(const Packet1Xl& a, const Packet1Xl& b, const Packet1Xl& c, + const Packet1Xl& d) { + return __riscv_vcreate_v_f64m1_f64m4(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(c, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xd pcast(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c, + const Packet1Xd& d) { + return __riscv_vcreate_v_f64m1_f64m4(a, b, c, d); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xl pcast(const Packet1Xd& a, const Packet1Xd& b, const Packet1Xd& c, + const Packet1Xd& d) { + return __riscv_vcreate_v_i64m1_i64m4(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + 
__riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(c, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(d, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pcast(const Packet1Xl& a, const Packet1Xl& b) { + return __riscv_vcreate_v_i64m1_i64m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcast(const Packet1Xl& a, const Packet1Xl& b) { + return __riscv_vcreate_v_f64m1_f64m2(__riscv_vfcvt_f_x_v_f64m1(a, unpacket_traits::size), + __riscv_vfcvt_f_x_v_f64m1(b, unpacket_traits::size)); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xd pcast(const Packet1Xd& a, const Packet1Xd& b) { + return __riscv_vcreate_v_f64m1_f64m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet2Xl pcast(const Packet1Xd& a, const Packet1Xd& b) { + return __riscv_vcreate_v_i64m1_i64m2(__riscv_vfcvt_rtz_x_f_v_i64m1(a, unpacket_traits::size), + __riscv_vfcvt_rtz_x_f_v_i64m1(b, unpacket_traits::size)); +} + +/********************************* 16 bits ************************************/ + +template <> +EIGEN_STRONG_INLINE Packet2Xs pcast(const Packet1Xs& a, const Packet1Xs& b) { + return __riscv_vcreate_v_i16m1_i16m2(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet4Xs pcast(const Packet1Xs& a, const Packet1Xs& b, const Packet1Xs& c, + const Packet1Xs& d) { + return __riscv_vcreate_v_i16m1_i16m4(a, b, c, d); +} + +} // namespace internal +} // namespace Eigen + +#endif // EIGEN_TYPE_CASTING_RVV10_H diff --git a/Eigen/src/Core/util/ConfigureVectorization.h b/Eigen/src/Core/util/ConfigureVectorization.h index 26d2bcae210cb4d87b60025c6d5ccbc4fc189eae..80ad82ef184de574e0493196c7111fa2c8049c06 100644 --- a/Eigen/src/Core/util/ConfigureVectorization.h +++ b/Eigen/src/Core/util/ConfigureVectorization.h @@ -80,6 +80,8 @@ #define EIGEN_IDEAL_MAX_ALIGN_BYTES 32 #elif defined __HVX__ && (__HVX_LENGTH__ == 128) #define EIGEN_IDEAL_MAX_ALIGN_BYTES 128 +#elif defined(EIGEN_RISCV64_USE_RVV10) +#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64 #else #define EIGEN_IDEAL_MAX_ALIGN_BYTES 16 #endif @@ -116,7 +118,7 @@ // Only static alignment is really problematic (relies on nonstandard compiler extensions), // try to keep heap alignment even when we have to disable static alignment. #if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \ - EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64) + EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64 || EIGEN_ARCH_RISCV) #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1 #else #define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0 @@ -418,14 +420,55 @@ extern "C" { #define EIGEN_VECTORIZE_SVE #include -// Since we depend on knowing SVE vector lengths at compile-time, we need -// to ensure a fixed lengths is set +// Since we depend on knowing SVE vector length at compile-time, we need +// to ensure a fixed length is set #if defined __ARM_FEATURE_SVE_BITS #define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS #else #error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set." 
 #endif
+#elif EIGEN_ARCH_RISCV
+
+#if defined(__riscv_zfh)
+#define EIGEN_HAS_BUILTIN_FLOAT16
+#endif
+
+// We currently require RVV to be enabled explicitly via EIGEN_RISCV64_USE_RVV10 and
+// will not select the backend automatically
+#if (defined EIGEN_RISCV64_USE_RVV10)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_RVV10
+#include <riscv_vector.h>
+
+// Since we depend on knowing RVV vector length at compile-time, we need
+// to ensure a fixed length is set
+#if defined(__riscv_v_fixed_vlen)
+#define EIGEN_RISCV64_RVV_VL __riscv_v_fixed_vlen
+#if __riscv_v_fixed_vlen >= 256
+#undef EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#endif
+#else
+#error "Eigen requires a fixed RVV vector length but -mrvv-vector-bits=zvl is not set."
+#endif
+
+#undef EIGEN_STACK_ALLOCATION_LIMIT
+#define EIGEN_STACK_ALLOCATION_LIMIT 196608
+
+#if defined(__riscv_zvfh) && defined(__riscv_zfh)
+#define EIGEN_VECTORIZE_RVV10FP16
+#elif defined(__riscv_zvfh)
+#if defined(__GNUC__) || defined(__clang__)
+#warning "The Eigen::Half vectorization requires Zfh and Zvfh extensions."
+#elif defined(_MSC_VER)
+#pragma message("The Eigen::Half vectorization requires Zfh and Zvfh extensions.")
+#endif
+#endif
+
+#endif  // defined(EIGEN_RISCV64_USE_RVV10)
+
 #elif (defined __s390x__ && defined __VEC__)
 
 #define EIGEN_VECTORIZE
@@ -510,6 +553,13 @@ extern "C" {
 #include
 #endif
 
+#if defined(__riscv)
+// Defines the default LMUL for RISC-V
+#ifndef EIGEN_RISCV64_DEFAULT_LMUL
+#define EIGEN_RISCV64_DEFAULT_LMUL 1
+#endif
+#endif
+
 /** \brief Namespace containing all symbols from the %Eigen library. */
 // IWYU pragma: private
 #include "../InternalHeaderCheck.h"
diff --git a/Eigen/src/Core/util/Constants.h b/Eigen/src/Core/util/Constants.h
index fcc2db82266e54776d6efe1403fa84ae44011225..8aba62b75565ff787b1399b3dde62fb6601a1025 100644
--- a/Eigen/src/Core/util/Constants.h
+++ b/Eigen/src/Core/util/Constants.h
@@ -475,6 +475,7 @@ enum Type {
   SVE = 0x6,
   HVX = 0x7,
   LSX = 0x8,
+  RVV10 = 0x9,
 #if defined EIGEN_VECTORIZE_SSE
   Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
@@ -491,6 +492,8 @@ enum Type {
   Target = HVX
 #elif defined EIGEN_VECTORIZE_LSX
   Target = LSX
+#elif defined EIGEN_VECTORIZE_RVV10
+  Target = RVV10
 #else
   Target = Generic
 #endif
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index e644211f3cad3a7ba50a67ffec77169f33c776cc..9cd825085885f84ce2179e78354c06614f2ea279 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -420,6 +420,13 @@
 #define EIGEN_ARCH_PPC 0
 #endif
 
+/// \internal EIGEN_ARCH_RISCV set to 1 if the architecture is RISC-V.
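+/// GCC and Clang define __riscv for both RV32 and RV64 targets.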
+#if defined(__riscv) +#define EIGEN_ARCH_RISCV 1 +#else +#define EIGEN_ARCH_RISCV 0 +#endif + //------------------------------------------------------------------------------------------ // Operating system identification, EIGEN_OS_* //------------------------------------------------------------------------------------------ @@ -1023,7 +1030,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(cons #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var); #if !defined(EIGEN_ASM_COMMENT) -#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64) +#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_RISCV) #define EIGEN_ASM_COMMENT(X) __asm__("#" X) #else #define EIGEN_ASM_COMMENT(X) diff --git a/Eigen/src/Jacobi/Jacobi.h b/Eigen/src/Jacobi/Jacobi.h index 09bffa4a90c2f0666813e6f8d117c8801e4c3daf..d97477b3384e6af954227dfd84c62bf03efbd2cf 100644 --- a/Eigen/src/Jacobi/Jacobi.h +++ b/Eigen/src/Jacobi/Jacobi.h @@ -305,7 +305,7 @@ struct apply_rotation_in_the_plane_selector::type OtherPacket; constexpr int RequiredAlignment = - (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); + (std::max)(unpacket_traits::alignment, unpacket_traits::alignment); constexpr Index PacketSize = packet_traits::size; /*** dynamic-size vectorized paths ***/
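For reference, the backend above is strictly opt-in. A minimal usage sketch, assuming a Clang >= 17 or GCC >= 14 RISC-V toolchain with RVV 1.0 support; the -march string, the 256-bit zvl choice, the LMUL value, and the file name check.cpp are illustrative, not part of this patch:

    // Illustrative build line (a fixed vector length is mandatory, per the
    // #error added to ConfigureVectorization.h above):
    //   clang++ -O2 -march=rv64gcv_zvl256b -mrvv-vector-bits=zvl \
    //           -DEIGEN_RISCV64_USE_RVV10 -DEIGEN_RISCV64_DEFAULT_LMUL=2 check.cpp
    #include <iostream>
    #include <Eigen/Core>

    int main() {
    #if defined(EIGEN_VECTORIZE_RVV10)
      // With VLEN = 256 and LMUL = 2, rvv_packet_size_selector yields
      // 256 * 2 / (sizeof(float) * CHAR_BIT) = 16 float lanes per packet.
      std::cout << "RVV10 active, float packet size = "
                << Eigen::internal::packet_traits<float>::size << '\n';
    #else
      std::cout << "RVV10 backend not selected\n";
    #endif
      return 0;
    }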